diff --git a/.gitignore b/.gitignore
index 7e21ba0b750dfc..65ba217de37c82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@ build/
*.user
.vscode
-.idea
\ No newline at end of file
+.idea
+.project
+.pydevproject
diff --git a/.travis.yml b/.travis.yml
index d3dae9efd416bd..bf0e0b7bbddd4c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,17 @@ language: cpp
cache: ccache
sudo: required
dist: trusty
+os:
+ - linux
+ - osx
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
+matrix:
+ exclude:
+ - os: osx
+ env: JOB=DOCS # Only generate documentation in linux
+
addons:
apt:
packages:
@@ -27,9 +35,11 @@ addons:
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
+ - graphviz
before_install:
+ - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
+ - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark
- - sudo paddle/scripts/travis/before_install.sh
script:
- paddle/scripts/travis/main.sh
notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44e93f22c0eaf4..4613155f7700b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b1)
+set(PADDLE_PATCH_VERSION 0b2)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -104,7 +104,7 @@ else()
endif(NOT WITH_GPU)
if(WITH_DOUBLE)
- add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE)
+ add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double)
else(WITH_DOUBLE)
set(ACCURACY float)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 529b4b9d15d097..57c32a54cd727e 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -17,10 +17,17 @@
## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
-find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)
+find_path(MKL_INCLUDE_DIR mkl.h PATHS
+ ${MKL_ROOT}/include)
+find_library(MKL_CORE_LIB NAMES mkl_core PATHS
+ ${MKL_ROOT}/lib
+ ${MKL_ROOT}/lib/intel64)
+find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
+ ${MKL_ROOT}/lib
+ ${MKL_ROOT}/lib/intel64)
+find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
+ ${MKL_ROOT}/lib
+ ${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index cc59309ee7efab..dbad6be3f41b3f 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -64,7 +64,9 @@ set(COMMON_FLAGS
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-error=literal-suffix
- -Wno-error=unused-local-typedefs)
+ -Wno-error=unused-local-typedefs
+ -Wno-error=unused-function # Warnings in Numpy Header.
+)
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
diff --git a/cmake/util.cmake b/cmake/util.cmake
index d776c3ae499526..0fa36f070cc11b 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME)
cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
endmacro()
+
+
+# Creates a C resources file (a byte array plus a size constant) from the given resource file
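+# Example (hypothetical paths): create_resources(${CMAKE_SOURCE_DIR}/data/img.bin ${CMAKE_BINARY_DIR}/img_bin.c)
+# would define `const unsigned char img_bin[]` and `const unsigned img_bin_size` in the output file.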
+function(create_resources res_file output)
+ # Create empty output file
+ file(WRITE ${output} "")
+ # Get short filename
+ string(REGEX MATCH "([^/]+)$" filename ${res_file})
+ # Replace filename spaces & extension separator for C compatibility
+ string(REGEX REPLACE "\\.| |-" "_" filename ${filename})
+ # Read hex data from file
+ file(READ ${res_file} filedata HEX)
+ # Convert hex data for C compatibility
+ string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
+ # Append data to output file
+ file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+endfunction()
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
new file mode 100644
index 00000000000000..810910fd5ca56f
--- /dev/null
+++ b/demo/mnist/.gitignore
@@ -0,0 +1,6 @@
+data/raw_data
+data/*.list
+mnist_vgg_model
+plot.png
+train.log
+*pyc
diff --git a/demo/mnist/data/generate_list.py b/demo/mnist/data/generate_list.py
new file mode 100644
index 00000000000000..1b929048b4d82b
--- /dev/null
+++ b/demo/mnist/data/generate_list.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+o = open("./train.list", "w")
+o.write("./data/raw_data/train\n")
+o.close()
+
+o = open("./test.list", "w")
+o.write("./data/raw_data/t10k\n")
+o.close()
\ No newline at end of file
diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh
new file mode 100755
index 00000000000000..9099b5ab6fb85d
--- /dev/null
+++ b/demo/mnist/data/get_mnist_data.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env sh
+# This script downloads the MNIST data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/raw_data"
+mkdir "$DIR/raw_data"
+cd "$DIR/raw_data"
+
+echo "Downloading..."
+
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+ if [ ! -e $fname ]; then
+ wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+ gunzip ${fname}.gz
+ fi
+done
+
+cd $DIR
+rm -f *.list
+python generate_list.py
+
diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py
new file mode 100644
index 00000000000000..32af29730a7365
--- /dev/null
+++ b/demo/mnist/mnist_provider.py
@@ -0,0 +1,32 @@
+from paddle.trainer.PyDataProvider2 import *
+
+
+# Define a py data provider
+@provider(input_types={
+ 'pixel': dense_vector(28 * 28),
+ 'label': integer_value(10)
+})
+def process(settings, filename): # settings is not used currently.
+ imgf = filename + "-images-idx3-ubyte"
+ labelf = filename + "-labels-idx1-ubyte"
+ f = open(imgf, "rb")
+ l = open(labelf, "rb")
+
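+    # Skip the IDX file headers: 16 bytes for the image file (magic number,
+    # image count, rows, cols) and 8 bytes for the label file (magic number, count).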
+ f.read(16)
+ l.read(8)
+
+ # Define number of samples for train/test
+ if "train" in filename:
+ n = 60000
+ else:
+ n = 10000
+
+ for i in range(n):
+ label = ord(l.read(1))
+ pixels = []
+ for j in range(28 * 28):
+ pixels.append(float(ord(f.read(1))) / 255.0)
+ yield {"pixel": pixels, 'label': label}
+
+ f.close()
+ l.close()
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
new file mode 100755
index 00000000000000..084b32ac390b84
--- /dev/null
+++ b/demo/mnist/train.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+config=vgg_16_mnist.py
+output=./mnist_vgg_model
+log=train.log
+
+paddle train \
+--config=$config \
+--dot_period=10 \
+--log_period=100 \
+--test_all_data_in_one_period=1 \
+--use_gpu=0 \
+--trainer_count=1 \
+--num_passes=100 \
+--save_dir=$output \
+2>&1 | tee $log
+
+python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py
new file mode 100644
index 00000000000000..45a45bb061aa78
--- /dev/null
+++ b/demo/mnist/vgg_16_mnist.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+is_predict = get_config_arg("is_predict", bool, False)
+
+####################Data Configuration ##################
+
+
+if not is_predict:
+ data_dir='./data/'
+ define_py_data_sources2(train_list= data_dir + 'train.list',
+ test_list= data_dir + 'test.list',
+ module='mnist_provider',
+ obj='process')
+
+######################Algorithm Configuration #############
+settings(
+ batch_size = 128,
+ learning_rate = 0.1 / 128.0,
+ learning_method = MomentumOptimizer(0.9),
+ regularization = L2Regularization(0.0005 * 128)
+)
+
+#######################Network Configuration #############
+
+data_size=1*28*28
+label_size=10
+img = data_layer(name='pixel', size=data_size)
+
+# small_vgg is predefined in trainer_config_helpers.networks
+predict = small_vgg(input_image=img,
+ num_channels=1,
+ num_classes=label_size)
+
+if not is_predict:
+ lbl = data_layer(name="label", size=label_size)
+ inputs(img, lbl)
+ outputs(classification_cost(input=predict, label=lbl))
+else:
+ outputs(predict)
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh
index fb2bee98beb268..fe2acbbd74898f 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -20,6 +20,8 @@
set -e
+export LC_ALL=C
+
mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index 1f0a137c8bd594..ea4e32249a3d01 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/demo/quick_start/trainer_config.bidi-lstm.py
new file mode 100644
index 00000000000000..3be3d373422714
--- /dev/null
+++ b/demo/quick_start/trainer_config.bidi-lstm.py
@@ -0,0 +1,62 @@
+# edit-mode: -*- python -*-
+
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+ for i, line in enumerate(f):
+ w = line.strip().split()[0]
+ word_dict[w] = i
+
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+ test_list=tst,
+ module="dataprovider_emb",
+ obj=process,
+ args={"dictionary": word_dict})
+
+batch_size = 128 if not is_predict else 1
+settings(
+ batch_size=batch_size,
+ learning_rate=2e-3,
+ learning_method=AdamOptimizer(),
+ regularization=L2Regularization(8e-4),
+ gradient_clipping_threshold=25
+)
+
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+
+bi_lstm = bidirectional_lstm(input=emb, size=128)
+dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
+
+output = fc_layer(input=dropout, size=2,
+ bias_attr=bias_attr,
+ act=SoftmaxActivation())
+
+if is_predict:
+ maxid = maxid_layer(output)
+ outputs([maxid, output])
+else:
+ label = data_layer(name="label", size=2)
+ cls = classification_cost(input=output, label=label)
+ outputs(cls)
diff --git a/demo/quick_start/trainer_config.db-lstm.py b/demo/quick_start/trainer_config.db-lstm.py
new file mode 100644
index 00000000000000..b35bdf5a61b473
--- /dev/null
+++ b/demo/quick_start/trainer_config.db-lstm.py
@@ -0,0 +1,73 @@
+# edit-mode: -*- python -*-
+
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+ for i, line in enumerate(f):
+ w = line.strip().split()[0]
+ word_dict[w] = i
+
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+ test_list=tst,
+ module="dataprovider_emb",
+ obj=process,
+ args={"dictionary": word_dict})
+
+batch_size = 128 if not is_predict else 1
+settings(
+ batch_size=batch_size,
+ learning_rate=2e-3,
+ learning_method=AdamOptimizer(),
+ regularization=L2Regularization(8e-4),
+ gradient_clipping_threshold=25
+)
+
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+
+hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
+lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
+
+input_layers = [hidden_0, lstm_0]
+
+for i in range(1,8):
+ fc = fc_layer(input=input_layers, size=128)
+ lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
+ reverse=(i % 2) == 1,)
+ input_layers = [fc, lstm]
+
+lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
+
+output = fc_layer(input=lstm_last, size=2,
+ bias_attr=bias_attr,
+ act=SoftmaxActivation())
+
+if is_predict:
+ maxid = maxid_layer(output)
+ outputs([maxid, output])
+else:
+ label = data_layer(name="label", size=2)
+ cls = classification_cost(input=output, label=label)
+ outputs(cls)
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index 2b0c3f34648b05..edd6ad3f739b6c 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj:
- encoded_proj += full_matrix_projection(encoded_vector)
+ encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
- decoder_boot += full_matrix_projection(backward_first)
+ decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
- decoder_inputs += full_matrix_projection(context)
- decoder_inputs += full_matrix_projection(current_word)
+ decoder_inputs += full_matrix_projection(input=context)
+ decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh
new file mode 100755
index 00000000000000..e579d6c46ce5ed
--- /dev/null
+++ b/demo/sequence_tagging/data/get_data.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
+wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
diff --git a/demo/sequence_tagging/data/test.list b/demo/sequence_tagging/data/test.list
new file mode 100644
index 00000000000000..073c0a0c9063ac
--- /dev/null
+++ b/demo/sequence_tagging/data/test.list
@@ -0,0 +1 @@
+data/test.txt.gz
diff --git a/demo/sequence_tagging/data/train.list b/demo/sequence_tagging/data/train.list
new file mode 100644
index 00000000000000..43c24d5f6484a9
--- /dev/null
+++ b/demo/sequence_tagging/data/train.list
@@ -0,0 +1 @@
+data/train.txt.gz
diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py
new file mode 100644
index 00000000000000..6f412d6834be6d
--- /dev/null
+++ b/demo/sequence_tagging/dataprovider.py
@@ -0,0 +1,258 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+import gzip
+import logging
+
+logging.basicConfig(
+ format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
+)
+logger = logging.getLogger('paddle')
+logger.setLevel(logging.INFO)
+
+OOV_POLICY_IGNORE = 0
+OOV_POLICY_USE = 1
+OOV_POLICY_ERROR = 2
+
+num_original_columns = 3
+
+# Feature combination patterns.
+# [[-1,0], [0,0]] means previous token at column 0 and current token at
+# column 0 are combined as one feature.
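+# For CoNLL-2000 style input (columns: word, POS tag, chunk label), e.g.
+# [[-2,1]] uses the POS tag of the token two positions to the left.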
+patterns = [
+ [[-2,0]],
+ [[-1,0]],
+ [[0,0]],
+ [[1,0]],
+ [[2,0]],
+
+ [[-1,0], [0,0]],
+ [[0,0], [1,0]],
+
+ [[-2,1]],
+ [[-1,1]],
+ [[0,1]],
+ [[1,1]],
+ [[2,1]],
+ [[-2,1], [-1,1]],
+ [[-1,1], [0,1]],
+ [[0,1], [1,1]],
+ [[1,1], [2,1]],
+
+ [[-2,1], [-1,1], [0,1]],
+ [[-1,1], [0,1], [1,1]],
+ [[0,1], [1,1], [2,1]],
+]
+
+dict_label = {
+ 'B-ADJP': 0,
+ 'I-ADJP': 1,
+ 'B-ADVP': 2,
+ 'I-ADVP': 3,
+ 'B-CONJP': 4,
+ 'I-CONJP': 5,
+ 'B-INTJ': 6,
+ 'I-INTJ': 7,
+ 'B-LST': 8,
+ 'I-LST': 9,
+ 'B-NP': 10,
+ 'I-NP': 11,
+ 'B-PP': 12,
+ 'I-PP': 13,
+ 'B-PRT': 14,
+ 'I-PRT': 15,
+ 'B-SBAR': 16,
+ 'I-SBAR': 17,
+ 'B-UCP': 18,
+ 'I-UCP': 19,
+ 'B-VP': 20,
+ 'I-VP': 21,
+ 'O': 22
+}
+
+def make_features(sequence):
+ length = len(sequence)
+ num_features = len(sequence[0])
+ def get_features(pos):
+ if pos < 0:
+ return ['#B%s' % -pos] * num_features
+ if pos >= length:
+ return ['#E%s' % (pos - length + 1)] * num_features
+ return sequence[pos]
+
+ for i in xrange(length):
+ for pattern in patterns:
+ fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
+ sequence[i].append(fname)
+
+'''
+Source file format:
+Each line is for one timestep. The features are separated by space.
+An empty line indicates end of a sequence.
+
+cutoff: a list of numbers. If count of a feature is smaller than this,
+ it will be ignored.
+if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
+i-th column.
+
+return a list of dict for each column
+'''
+def create_dictionaries(filename, cutoff, oov_policy):
+ def add_to_dict(sequence, dicts):
+ num_features = len(dicts)
+ for features in sequence:
+ l = len(features)
+ assert l == num_features, "Wrong number of features " + line
+ for i in xrange(l):
+ if features[i] in dicts[i]:
+ dicts[i][features[i]] += 1
+ else:
+ dicts[i][features[i]] = 1
+
+ num_features = len(cutoff)
+ dicts = []
+ for i in xrange(num_features):
+ dicts.append(dict())
+
+ f = gzip.open(filename, 'rb')
+
+ sequence = []
+
+ for line in f:
+ line = line.strip()
+ if not line:
+ make_features(sequence)
+ add_to_dict(sequence, dicts)
+ sequence = []
+ continue
+ features = line.split(' ')
+ sequence.append(features)
+
+
+ for i in xrange(num_features):
+ dct = dicts[i]
+ n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
+ todo = []
+ for k, v in dct.iteritems():
+ if v < cutoff[i]:
+ todo.append(k)
+ else:
+ dct[k] = n
+ n += 1
+
+ if oov_policy[i] == OOV_POLICY_USE:
+ # placeholder so that len(dct) will be the number of features
+ # including OOV
+ dct['#OOV#'] = 0
+
+ logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
+ for k in todo:
+ del dct[k]
+
+ f.close()
+ return dicts
+
+
+def initializer(settings, **xargs):
+ cutoff = [3, 1, 0]
+ cutoff += [3] * len(patterns)
+ oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
+ oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
+ dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
+ dicts[2] = dict_label
+ settings.dicts = dicts
+ settings.oov_policy = oov_policy
+ input_types = []
+ num_features = len(dicts)
+ for i in xrange(num_original_columns):
+ input_types.append(integer_sequence(len(dicts[i])))
+ logger.info("slot %s size=%s" % (i, len(dicts[i])))
+ if patterns:
+ dim = 0
+ for i in xrange(num_original_columns, num_features):
+ dim += len(dicts[i])
+ input_types.append(sparse_binary_vector_sequence(dim))
+ logger.info("feature size=%s" % dim)
+ settings.input_types = input_types
+
+'''
+if oov_policy[i] == OOV_POLICY_USE, features in the i-th column which do not
+exist in dicts[i] will be assigned id 0.
+if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
+in dicts[i].
+'''
+@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, filename):
+ input_file = filename
+ dicts = settings.dicts
+ oov_policy = settings.oov_policy
+
+ def gen_sample(sequence):
+ num_features = len(dicts)
+ sample = [list() for i in xrange(num_original_columns)]
+ if patterns:
+ sample.append([])
+ for features in sequence:
+ assert len(features) == num_features, \
+ "Wrong number of features: " + line
+ for i in xrange(num_original_columns):
+ id = dicts[i].get(features[i], -1)
+ if id != -1:
+ sample[i].append(id)
+ elif oov_policy[i] == OOV_POLICY_IGNORE:
+ sample[i].append(0xffffffff)
+ elif oov_policy[i] == OOV_POLICY_ERROR:
+ logger.fatal("Unknown token: %s" % features[i])
+ else:
+ sample[i].append(0)
+
+ if patterns:
+ dim = 0
+ vec = []
+ for i in xrange(num_original_columns, num_features):
+ id = dicts[i].get(features[i], -1)
+ if id != -1:
+ vec.append(dim + id)
+ elif oov_policy[i] == OOV_POLICY_IGNORE:
+ pass
+ elif oov_policy[i] == OOV_POLICY_ERROR:
+ logger.fatal("Unknown token: %s" % features[i])
+ else:
+ vec.append(dim + 0)
+
+ dim += len(dicts[i])
+ sample[-1].append(vec)
+ return sample
+
+ num_features = len(dicts)
+ f = gzip.open(input_file, 'rb')
+
+ num_sequences = 0
+ sequence = []
+ for line in f:
+ line = line.strip()
+ if not line:
+ make_features(sequence)
+ yield gen_sample(sequence)
+ sequence = []
+ num_sequences += 1
+ continue
+ features = line.split(' ')
+ sequence.append(features)
+
+ f.close()
+
+ logger.info("num_sequences=%s" % num_sequences)
+
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
new file mode 100644
index 00000000000000..2bd1a20bc52fc5
--- /dev/null
+++ b/demo/sequence_tagging/linear_crf.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+import math
+
+define_py_data_sources2(train_list="data/train.list",
+ test_list="data/test.list",
+ module="dataprovider",
+ obj="process")
+
+
+batch_size = 1
+settings(
+ learning_method=MomentumOptimizer(),
+ batch_size=batch_size,
+ regularization=L2Regularization(batch_size * 1e-4),
+ average_window=0.5,
+ learning_rate=1e-1,
+ learning_rate_decay_a=1e-5,
+ learning_rate_decay_b=0.25,
+)
+
+num_label_types=23
+
+def get_simd_size(size):
+ return int(math.ceil(float(size) / 8)) * 8
+
+# Currently, in order to use sparse_update=True,
+# the size has to be aligned.
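+# Here get_simd_size(23) == 24, so num_label_types is padded from 23 to 24.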
+num_label_types = get_simd_size(num_label_types)
+
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+ size=num_label_types)
+
+crf_input = fc_layer(
+ input=features,
+ size=num_label_types,
+ act=LinearActivation(),
+ bias_attr=False,
+ param_attr=ParamAttr(initial_std=0, sparse_update=True))
+
+crf=crf_layer(
+ input=crf_input,
+ label=chunk,
+ param_attr=ParamAttr(name="crfw", initial_std=0),
+)
+
+crf_decoding=crf_decoding_layer(
+ size=num_label_types,
+ input=crf_input,
+ label=chunk,
+ param_attr=ParamAttr(name="crfw"),
+)
+
+sum_evaluator(
+ name="error",
+ input=crf_decoding,
+)
+
+chunk_evaluator(
+ name="chunk_f1",
+ input =[crf_decoding, chunk],
+ chunk_scheme="IOB",
+ num_chunk_types=11,
+)
+
+inputs(word, pos, chunk, features)
+outputs(crf)
diff --git a/demo/sequence_tagging/readme.md b/demo/sequence_tagging/readme.md
new file mode 100644
index 00000000000000..2e17fffb83c532
--- /dev/null
+++ b/demo/sequence_tagging/readme.md
@@ -0,0 +1,45 @@
+# Sequence Tagging
+
+This demo is a sequence model for assigning tags to each token in a sentence. The task is described in the CoNLL-2000 text chunking shared task.
+
+## Download data
+```bash
+cd demo/sequence_tagging
+./data/get_data.sh
+```
+
+## Train model
+```bash
+cd demo/sequence_tagging
+./train.sh          # stacked RNN + CRF model (rnn_crf.py, uses GPU)
+./train_linear.sh   # linear CRF model (linear_crf.py, CPU only)
+```
+
+## Model description
+
+We provide two models. One is a linear CRF model (linear_crf.py), which is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
+
+| Model name | Number of parameters | F1 score |
+|:----------:|:--------------------:|:--------:|
+| linear_crf | 1.8M | 0.937 |
+| rnn_crf    | 960K | 0.941 |
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
new file mode 100644
index 00000000000000..fb157bf3ea7193
--- /dev/null
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+import math
+
+define_py_data_sources2(train_list="data/train.list",
+ test_list="data/test.list",
+ module="dataprovider",
+ obj="process")
+
+batch_size = 16
+settings(
+ learning_method=MomentumOptimizer(),
+ batch_size=batch_size,
+ regularization=L2Regularization(batch_size * 1e-5),
+ average_window=0.5,
+ learning_rate = 2e-3,
+ learning_rate_decay_a = 5e-7,
+ learning_rate_decay_b = 0.5,
+)
+
+word_dim=128
+hidden_dim = 128
+with_rnn = True
+
+initial_std=1/math.sqrt(hidden_dim)
+param_attr=ParamAttr(initial_std=initial_std)
+cpu_layer_attr=ExtraLayerAttribute(device=-1)
+
+default_device(0)
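+# With --parallel_nn=1 (see train.sh), layers run on GPU 0 by default; layers
+# created with cpu_layer_attr (device=-1) are placed on the CPU instead.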
+
+num_label_types=23
+
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+ size=num_label_types,
+ layer_attr=cpu_layer_attr)
+
+emb = embedding_layer(
+ input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
+
+hidden1 = mixed_layer(
+ size=hidden_dim,
+ act=STanhActivation(),
+ bias_attr=True,
+ input=[full_matrix_projection(emb),
+ table_projection(pos, param_attr=param_attr)]
+)
+
+if with_rnn:
+ rnn1 = recurrent_layer(
+ act=ReluActivation(),
+ bias_attr=True,
+ input=hidden1,
+ param_attr=ParamAttr(initial_std=0),
+ )
+
+hidden2 = mixed_layer(
+ size=hidden_dim,
+ act=STanhActivation(),
+ bias_attr=True,
+ input=[full_matrix_projection(hidden1)
+ ] + ([
+ full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
+ ] if with_rnn else []),
+)
+
+if with_rnn:
+ rnn2=recurrent_layer(
+ reverse=True,
+ act=ReluActivation(),
+ bias_attr=True,
+ input=hidden2,
+ param_attr=ParamAttr(initial_std=0),
+ )
+
+crf_input = mixed_layer(
+ size=num_label_types,
+ bias_attr=False,
+ input=[
+ full_matrix_projection(hidden2),
+ ] + ([
+ full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
+ ] if with_rnn else []),
+)
+
+crf = crf_layer(
+ input=crf_input,
+ label=chunk,
+ param_attr=ParamAttr(name="crfw", initial_std=0),
+ layer_attr=cpu_layer_attr,
+)
+
+crf_decoding = crf_decoding_layer(
+ size=num_label_types,
+ input=crf_input,
+ label=chunk,
+ param_attr=ParamAttr(name="crfw"),
+ layer_attr=cpu_layer_attr,
+)
+
+sum_evaluator(
+ name="error",
+ input=crf_decoding,
+)
+
+chunk_evaluator(
+ name="chunk_f1",
+ input =[crf_decoding, chunk],
+ chunk_scheme="IOB",
+ num_chunk_types=11,
+)
+
+inputs(word, pos, chunk, features)
+outputs(crf)
diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh
new file mode 100755
index 00000000000000..9a706b98d86861
--- /dev/null
+++ b/demo/sequence_tagging/train.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+paddle train \
+ --config rnn_crf.py \
+ --parallel_nn=1 \
+ --use_gpu=1 \
+ --dot_period=10 \
+ --log_period=1000 \
+ --test_period=0 \
+ --num_passes=10
diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh
new file mode 100755
index 00000000000000..597b5afea9c63a
--- /dev/null
+++ b/demo/sequence_tagging/train_linear.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+paddle train \
+ --config linear_crf.py \
+ --use_gpu=0 \
+ --dot_period=100 \
+ --log_period=10000 \
+ --test_period=0 \
+ --num_passes=10
diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md
index 06fcff61720755..bbdbb4d4227d0b 100644
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD
git push -f origin HEAD
```
Now your Pull Request is updated with the latest version.
+
+## Revise your pull request
+
+When you revise your pull request according to the reviewers' comments, please use `git commit` instead of `git commit --amend` to commit your changes, so that the reviewers can see the difference between the new and the old versions of the pull request.
diff --git a/doc/build/docker_install.rst b/doc/build/docker_install.rst
index 542b9bac27afb8..e95de35f4da35f 100644
--- a/doc/build/docker_install.rst
+++ b/doc/build/docker_install.rst
@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme
.. code-block:: bash
- export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}"
+ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md
index ee3fa2a2166f49..e7d74512292c89 100644
--- a/doc/demo/quick_start/index_en.md
+++ b/doc/demo/quick_start/index_en.md
@@ -134,7 +134,7 @@ def process(settings, file_name):
You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
- The path of the training and testing data (`data/train.list`, `data/test.list`).
-- The location of the data provider file (`dataprovider_pow`).
+- The location of the data provider file (`dataprovider_bow`).
- The function to call to get data. (`process`).
- Additional arguments or data. Here it passes the path of word dictionary.
diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst
index 01443466105b5b..ab27c3bd6e8ad7 100644
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -73,6 +73,12 @@ img_pool_layer
:members: img_pool_layer
:noindex:
+maxout_layer
+------------
+.. automodule:: paddle.trainer_config_helpers.layers
+ :members: maxout_layer
+ :noindex:
+
Norm Layer
==========
@@ -130,6 +136,12 @@ gru_step_layer
Recurrent Layer Group
=====================
+memory
+------
+.. automodule:: paddle.trainer_config_helpers.layers
+ :members: memory
+ :noindex:
+
recurrent_group
---------------
.. automodule:: paddle.trainer_config_helpers.layers
@@ -377,6 +389,12 @@ ctc_layer
:members: ctc_layer
:noindex:
+nce_layer
+-----------
+.. automodule:: paddle.trainer_config_helpers.layers
+ :members: nce_layer
+ :noindex:
+
hsigmoid
---------
.. automodule:: paddle.trainer_config_helpers.layers
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.md
new file mode 100644
index 00000000000000..5282bbbcb82d00
--- /dev/null
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.md
@@ -0,0 +1,66 @@
+# 支持双层序列作为输入的Layer
+
+## 概述
+
+在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。
+
+双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。
+
+我们可以按照如下层次定义非序列,单层序列,以及双层序列。
+
++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型
++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息
++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列
+
+
+在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。
+## pooling_layer
+
+pooling_layer的使用示例如下,详细见配置API。
+```python
+seq_pool = pooling_layer(input=layer,
+ pooling_type=AvgPooling(),
+ agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。
+- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
+ - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列
+ - 输入:一个双层序列,或一个单层序列
+ - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值)
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
+ - 作用:一个双层序列经过运算变成一个单层序列
+ - 输入:必须是一个双层序列
+ - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值)
+
+## last_seq 和 first_seq
+
+last_seq的使用示例如下(first_seq类似),详细见配置API。
+```python
+last = last_seq(input=layer,
+ agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
+ - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列
+ - 输入:一个双层序列或一个单层序列
+ - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
+ - 作用:一个双层序列经过运算变成一个单层序列
+ - 输入:必须是一个双层序列
+ - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。
+
+## expand_layer
+
+expand_layer的使用示例如下,详细见配置API。
+```python
+expand = expand_layer(input=layer1,
+ expand_as=layer2,
+ expand_level=ExpandLevel.FROM_TIMESTEP)
+```
+- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值):
+ - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
+ - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
+ - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
+- `expand_level=ExpandLevel.FROM_SEQUENCE`时:
+ - 作用:一个单层序列经过运算扩展成一个双层序列
+ - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息
+ - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。
\ No newline at end of file
diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md
new file mode 100644
index 00000000000000..4a85cf336146ef
--- /dev/null
+++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md
@@ -0,0 +1,403 @@
+# 双层RNN配置与示例
+
+我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。
+
+## 示例1:双进双出,subseq间无memory
+
+配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。
+
+### 读取双层序列的方法
+
+首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式):
+
+- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。
+
+```text
+2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。
+2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 *
+2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。
+2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
+2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 .
+2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
+2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 !
+2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
+2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 *
+2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格
+```
+
+- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。
+
+```text
+2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。
+2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 *
+
+2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。
+2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
+2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 .
+
+2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
+2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 !
+
+2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
+2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 *
+2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格
+```
+
+其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`):
+
+- 单层序列的dataprovider如下:
+ - word_slot是integer_value_sequence类型,代表单层序列。
+ - label是integer_value类型,代表一个整数。
+
+```python
+def hook(settings, dict_file, **kwargs):
+ settings.word_dict = dict_file
+ settings.input_types = [integer_value_sequence(len(settings.word_dict)),
+ integer_value(3)]
+
+@provider(init_hook=hook)
+def process(settings, file_name):
+ with open(file_name, 'r') as fdata:
+ for line in fdata:
+ label, comment = line.strip().split('\t')
+ label = int(''.join(label.split()))
+ words = comment.split()
+ word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
+ yield word_slot, label
+```
+
+- 双层序列的dataprovider如下:
+ - word_slot是integer_value_sub_sequence类型,代表双层序列。
+ - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个整数,即一个句子一个label。通常根据任务需求进行不同设置。
+ - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。
+
+```python
+def hook2(settings, dict_file, **kwargs):
+ settings.word_dict = dict_file
+ settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
+ integer_value_sequence(3)]
+
+@provider(init_hook=hook2)
+def process2(settings, file_name):
+ with open(file_name) as fdata:
+ label_list = []
+ word_slot_list = []
+ for line in fdata:
+ if (len(line)) > 1:
+ label,comment = line.strip().split('\t')
+ label = int(''.join(label.split()))
+ words = comment.split()
+ word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
+ label_list.append(label)
+ word_slot_list.append(word_slot)
+ else:
+ yield word_slot_list, label_list
+ label_list = []
+ word_slot_list = []
+```
+
+### 模型中的配置
+
+首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。
+
+```python
+settings(batch_size=5)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# (lstm_input + lstm) is equal to lstmemory
+with mixed_layer(size=hidden_dim*4) as lstm_input:
+ lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory_group(input=lstm_input,
+ size=hidden_dim,
+ act=TanhActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=TanhActivation(),
+ lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(size=label_dim,
+ act=SoftmaxActivation(),
+ bias_attr=True) as output:
+ output += full_matrix_projection(input=lstm_last)
+
+outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
+
+```
+其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析:
+
+- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。
+- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。
+- lstmemory:
+ - 单层序列过了一个mixed_layer和lstmemory_group。
+ - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。
+- last_seq:
+ - 单层序列直接取了最后一个元素
+ - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。
+ - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样的。
+
+```python
+settings(batch_size=2)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb_group = embedding_layer(input=data, size=word_dim)
+
+# (lstm_input + lstm) is equal to lstmemory
+def lstm_group(lstm_group_input):
+ with mixed_layer(size=hidden_dim*4) as group_input:
+ group_input += full_matrix_projection(input=lstm_group_input)
+
+ lstm_output = lstmemory_group(input=group_input,
+ name="lstm_group",
+ size=hidden_dim,
+ act=TanhActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=TanhActivation(),
+ lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+ return lstm_output
+
+lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
+ step=lstm_group,
+ name="lstm_nest_group")
+# hasSubseq ->(seqlastins) seq
+lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
+
+# seq ->(expand) hasSubseq
+lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE)
+
+# hasSubseq ->(average) seq
+lstm_average = pooling_layer(input=lstm_expand,
+ pooling_type=AvgPooling(),
+ agg_level=AggregateLevel.EACH_SEQUENCE)
+
+with mixed_layer(size=label_dim,
+ act=SoftmaxActivation(),
+ bias_attr=True) as output:
+ output += full_matrix_projection(input=lstm_average)
+
+outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
+```
+## 示例2:双进双出,subseq间有memory
+
+配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。
+
+### 读取双层序列的方法
+
+我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`)
+```python
+data = [
+ [[[1, 3, 2], [4, 5, 2]], 0],
+ [[[0, 2], [2, 5], [0, 1, 2]], 1],
+]
+
+@provider(input_types=[integer_value_sub_sequence(10),
+ integer_value(3)])
+def process_subseq(settings, file_name):
+ for d in data:
+ yield d
+
+@provider(input_types=[integer_value_sequence(10),
+ integer_value(3)])
+def process_seq(settings, file_name):
+    for d in data:
+        seq = []
+        for subseq in d[0]:
+            seq += subseq
+        yield seq, d[1]
+```
+- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。
+- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。
+- 单双层序列的label都分别是0和1
+
+### 模型中的配置
+
+我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。
+
+- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接。
+
+```python
+def step(y):
+ mem = memory(name="rnn_state", size=hidden_dim)
+ return fc_layer(input=[y, mem],
+ size=hidden_dim,
+ act=TanhActivation(),
+ bias_attr=True,
+ name="rnn_state")
+
+out = recurrent_group(step=step, input=emb)
+```
+- 双层序列,外层memory是一个元素:
+ - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。
+ - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。
+
+```python
+def outer_step(x):
+ outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+ def inner_step(y):
+ inner_mem = memory(name="inner_rnn_state",
+ size=hidden_dim,
+ boot_layer=outer_mem)
+ return fc_layer(input=[y, inner_mem],
+ size=hidden_dim,
+ act=TanhActivation(),
+ bias_attr=True,
+ name="inner_rnn_state")
+
+ inner_rnn_output = recurrent_group(
+ step=inner_step,
+ input=x)
+ last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+ return inner_rnn_output
+
+out = recurrent_group(step=outer_step, input=SubsequenceInput(emb))
+```
+- 双层序列,外层memory是单层序列:
+ - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。
+ - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。
+
+## 示例3:双进双出,输入不等长
+
+**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)
+
+### 读取双层序列的方法
+
+我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`)
+```python
+data2 = [
+ [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
+ [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
+]
+
+@provider(input_types=[integer_value_sub_sequence(10),
+ integer_value_sub_sequence(10),
+ integer_value(2)],
+ should_shuffle=False)
+def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider
+ for d in data2:
+ yield d
+
+
+@provider(input_types=[integer_value_sequence(10),
+ integer_value_sequence(10),
+ integer_value(2)],
+ should_shuffle=False)
+def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider
+ for d in data2:
+ words1=reduce(lambda x,y: x+y, d[0])
+ words2=reduce(lambda x,y: x+y, d[1])
+ yield words1, words2, d[2]
+```
+
+data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。
+
+- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]]
+- 双层序列:两个样本分别为
+ - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]]
+ - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
+ - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。
+- 单双层序列中,两个样本的label都分别是0和1
+
+### 模型中的配置
+
+单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。
+
+- 单层序列:
+ - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
+ - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。
+
+```python
+def step(x1, x2):
+ def calrnn(y):
+ mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim)
+ out = fc_layer(input = [y, mem],
+ size = hidden_dim,
+ act = TanhActivation(),
+ bias_attr = True,
+ name = 'rnn_state_' + y.name)
+ return out
+
+ encoder1 = calrnn(x1)
+ encoder2 = calrnn(x2)
+ return [encoder1, encoder2]
+
+encoder1_rep, encoder2_rep = recurrent_group(
+ name="stepout",
+ step=step,
+ input=[emb1, emb2])
+
+encoder1_last = last_seq(input = encoder1_rep)
+encoder1_expandlast = expand_layer(input = encoder1_last,
+ expand_as = encoder2_rep)
+context = mixed_layer(input = [identity_projection(encoder1_expandlast),
+ identity_projection(encoder2_rep)],
+ size = hidden_dim)
+```
+- 双层序列:
+ - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。
+ - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。
+ - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
+
+```python
+def outer_step(x1, x2):
+ outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim)
+ outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim)
+ def inner_step1(y):
+ inner_mem = memory(name = 'inner_rnn_state_' + y.name,
+ size = hidden_dim,
+ boot_layer = outer_mem1)
+ out = fc_layer(input = [y, inner_mem],
+ size = hidden_dim,
+ act = TanhActivation(),
+ bias_attr = True,
+ name = 'inner_rnn_state_' + y.name)
+ return out
+
+ def inner_step2(y):
+ inner_mem = memory(name = 'inner_rnn_state_' + y.name,
+ size = hidden_dim,
+ boot_layer = outer_mem2)
+ out = fc_layer(input = [y, inner_mem],
+ size = hidden_dim,
+ act = TanhActivation(),
+ bias_attr = True,
+ name = 'inner_rnn_state_' + y.name)
+ return out
+
+ encoder1 = recurrent_group(
+ step = inner_step1,
+ name = 'inner1',
+ input = x1)
+
+ encoder2 = recurrent_group(
+ step = inner_step2,
+ name = 'inner2',
+ input = x2)
+
+ sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1')
+ sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2')
+
+ encoder1_expand = expand_layer(input = sentence_last_state1,
+ expand_as = encoder2)
+
+ return [encoder1_expand, encoder2]
+
+encoder1_rep, encoder2_rep = recurrent_group(
+ name="outer",
+ step=outer_step,
+ input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
+ targetInlink=emb2)
+
+encoder1_last = last_seq(input = encoder1_rep)
+encoder1_expandlast = expand_layer(input = encoder1_last,
+ expand_as = encoder2_rep)
+context = mixed_layer(input = [identity_projection(encoder1_expandlast),
+ identity_projection(encoder2_rep)],
+ size = hidden_dim)
+```
+
+## 示例4:beam_search的生成
+
+TBD
\ No newline at end of file
diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc_cn/algorithm/rnn/rnn-tutorial.md
new file mode 100644
index 00000000000000..7a553054c80392
--- /dev/null
+++ b/doc_cn/algorithm/rnn/rnn-tutorial.md
@@ -0,0 +1,96 @@
+# Recurrent Group教程
+
+## 概述
+
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+
+一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。
+
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。
+
+在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。
+
+更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。
+
+目前,在PaddlePaddle中,能够对双层序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:支持双层序列作为输入的Layer。
+
+## 相关概念
+
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+
+PaddlePaddle中,`recurrent_group`的一个简单调用如下:
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算
+- input:输入,必须是一个单层序列,或者一个双层序列
+- reverse:是否以逆序处理输入序列
+
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+
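+下面是一个最简单的step函数示例(与双层RNN配置示例文档中单层RNN的step函数相同,其中emb、hidden_dim仅为示意的输入与隐层大小):
+
+``` python
+def step(y):
+    # memory引用本层上一个时间步的输出,形成recurrent连接
+    mem = memory(name="rnn_state", size=hidden_dim)
+    return fc_layer(input=[y, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+
+out = recurrent_group(step=step, input=emb)
+```
+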
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型:
+
+- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。
+
+- **只读Memory输入**:`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。
+
+- **序列生成任务的输入**:`GeneratedInput`只用于在序列生成任务中指定输入数据。
+
+### 输入示例
+
+序列生成任务大多遵循encoder-decoder架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。
+
+给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入:
+
+- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。
+
+- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machine](https://arxiv.org/abs/1410.5401)。
+
+在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+
+### 输出
+`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。
+
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到该layer上一时刻的输出,因此,可以将memory理解为一个时延操作。
+
+可以显式地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。
+
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。
+
+利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。
+
+- 单层(word-level)RNN:每个状态(state)对应一个词(word)。
+- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。
+
+为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。
+
+## 双层RNN的使用
+
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定:
+
+- **单进单出**:输入和输出都是单层序列。
+ - 如果有多个输入,不同输入序列含有的词语数必须严格相等。
+ - 输出一个单层序列,输出序列的词语数和输入序列一致。
+ - memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。
+ - boot_layer:memory的初始状态,默认初始状态为0,memory的is_seq参数必须为false。
+
+- **双进双出**:输入和输出都是双层序列。
+ - 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。
+ - 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。
+ - memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。
+ - boot_layer:memory 初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。
+
+- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。
+
+
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定:
+
+- 单层RNN:从一个word生成下一个word。
+- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc_cn/build_and_install/install/docker_install.rst
index 44aa2a0983f4fd..a5f5fb117e11e8 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc_cn/build_and_install/install/docker_install.rst
@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本
+-----------------+------------------+------------------------+-----------------------+
| GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
-| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest |
+| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
-| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest |
+| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
其中,横向包括三个版本,normal,devel和demo。
diff --git a/doc_cn/conf.py.in b/doc_cn/conf.py.in
index 391f7981eab809..93242ace406000 100644
--- a/doc_cn/conf.py.in
+++ b/doc_cn/conf.py.in
@@ -47,6 +47,7 @@ extensions = [
'sphinx.ext.autosummary',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
+ 'sphinx.ext.graphviz'
]
table_styling_embed_css = True
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
new file mode 100644
index 00000000000000..283607957ce630
--- /dev/null
+++ b/doc_cn/faq/index.rst
@@ -0,0 +1,169 @@
+##################
+PaddlePaddle FAQ
+##################
+
+.. contents::
+
+1. How to reduce PaddlePaddle's memory usage
+----------------------------------------------
+
+Training a neural network is inherently memory-hungry, both in host memory and in GPU memory; it often consumes tens of gigabytes of host memory and several gigabytes of GPU memory.
+PaddlePaddle's memory usage mainly falls into the following categories:
+
+* DataProvider buffer pool memory (host memory only)
+* Neuron activation memory (host and GPU memory)
+* Parameter memory (host and GPU memory)
+* Miscellaneous memory
+
+Here, miscellaneous memory means memory used by PaddlePaddle itself, such as string allocations and temporary variables;
+reducing this kind of memory is not discussed here.
+
+The other categories can be reduced as follows.
+
+
+Reducing the DataProvider buffer pool memory
+++++++++++++++++++++++++++++++++++++++++++++++
+
+PyDataProvider loads data asynchronously and shuffles by picking samples at random directly from an in-memory pool, i.e.
+
+.. graphviz::
+
+    digraph {
+        rankdir=LR;
+        "data files" -> "memory pool" -> "PaddlePaddle training"
+    }
+
+Shrinking this memory pool therefore reduces memory usage and also speeds up data loading before training starts. However, this
+pool effectively determines the granularity of shuffling, so if you shrink it and still want the data to be random,
+it is best to shuffle the data file before every read. A possible implementation is
+
+.. literalinclude:: reduce_min_pool_size.py
+
+This can greatly reduce memory usage and may also speed up training. See the detailed documentation `here
+<../ui/data_provider/pydataprovider2.html#provider>`_ .
+
+Neuron activation memory
+++++++++++++++++++++++++++
+
+During training, the network temporarily stores data for every activation, including the activations themselves, the errors (deltas), and so on.
+During back-propagation this data is used to update the parameters. The memory it occupies mainly depends on two factors:
+the batch size and the length of each sequence. In other words, it is roughly proportional to the number of
+time steps contained in each mini-batch.
+
+There are therefore two possible remedies:
+
+* Reduce the batch size, i.e. set :code:`settings(batch_size=1000)` in the network configuration to a smaller value. Note that the batch size is itself a hyper-parameter of the network, and reducing it may affect the training result.
+* Shorten the sequences, or simply drop extremely long ones (a provider sketch follows this list). For example, if most sequences in a dataset are 100-200 tokens long
+  but one sequence suddenly contains 10000 tokens, memory can easily be exhausted, especially in RNNs such as LSTMs.
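+
+As a hypothetical sketch of the second point, a data provider can simply skip over-long samples; :code:`MAX_LEN`, the vocabulary size and the line format are illustrative assumptions.
+
+.. code-block:: python
+
+    MAX_LEN = 1000  # assumed cutoff; tune it for your data
+
+    @provider(input_types=[integer_value_sequence(30000)], min_pool_size=0)
+    def process(settings, filename):
+        with open(filename) as f:
+            for line in f:
+                word_ids = [int(w) for w in line.split()]  # assumed format
+                if len(word_ids) > MAX_LEN:
+                    continue  # drop sequences that would blow up memory
+                yield word_ids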
+
+Parameter memory
++++++++++++++++++++
+
+PaddlePaddle supports many optimization algorithms (optimizers), and different optimizers need different amounts of memory.
+For example, the :code:`adadelta` algorithm needs roughly five times as much memory as the parameters themselves: if the saved parameter
+file is :code:`100M`, this optimizer needs at least :code:`500M` of memory.
+
+Consider using a lighter optimizer such as :code:`momentum`, for example as sketched below.
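+
+A minimal configuration sketch, assuming the :code:`settings` API from trainer_config_helpers; the concrete hyper-parameter values are placeholders.
+
+.. code-block:: python
+
+    # Momentum keeps far less optimizer state than adadelta, so it needs
+    # much less extra memory; the values below are only illustrative.
+    settings(batch_size=128,
+             learning_rate=1e-3,
+             learning_method=MomentumOptimizer(momentum=0.9))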
+
+2. How to speed up PaddlePaddle training
+------------------------------------------
+
+PaddlePaddle is a neural network training platform; training can be accelerated in the following ways:
+
+* Reduce the time spent loading data
+* Speed up the computation itself
+* Use more computing resources
+
+Reducing data loading time
++++++++++++++++++++++++++++++
+
+When using :code:`pydataprovider`, shrinking the buffer pool and enabling in-memory caching can greatly speed up data loading.
+Shrinking the :code:`DataProvider` buffer pool works on the same principle as reducing memory usage by shrinking the pool, as described above.
+
+.. literalinclude:: reduce_min_pool_size.py
+
+In addition, the :code:`@provider` interface has a :code:`cache` parameter that controls the caching strategy. Setting it to :code:`CacheType.CACHE_PASS_IN_MEM` caches the data generated during the first :code:`pass` (going through all training data once is one pass) in memory; in later passes the data is read directly from this in-memory cache instead of from the :code:`python` side, which also greatly reduces data loading time. A sketch follows.
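+
+A hypothetical provider sketch combining a small pool with in-memory caching; the input type and line parsing are illustrative assumptions.
+
+.. code-block:: python
+
+    @provider(min_pool_size=0,
+              cache=CacheType.CACHE_PASS_IN_MEM,  # cache the first pass in memory
+              input_types=[dense_vector(100)])    # assumed feature size
+    def process(settings, filename):
+        with open(filename) as f:
+            for line in f:
+                yield [float(x) for x in line.split()]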
+
+
+Speeding up the computation
++++++++++++++++++++++++++++++
+
+PaddlePaddle supports sparse training, which requires the training features to be one of :code:`sparse_binary_vector`, :code:`sparse_vector`, or :code:`integer_value`. In addition, the layer that consumes this data must set its parameter to sparse update mode, i.e. :code:`sparse_update=True`.
+
+Here we take training a simple :code:`word2vec` language model as an example. The usage is as follows:
+
+The two words before and the two words after a word are used to predict that middle word. The DataProvider for this task is:
+
+.. literalinclude:: word2vec_dataprovider.py
+
+The configuration for this task is:
+
+.. literalinclude:: word2vec_config.py
+
+For more about sparse training, please refer to the `sparse training documentation `_
+
+Using more computing resources
+++++++++++++++++++++++++++++++++
+
+More computing resources can be used in the following ways:
+
+* Single-machine CPU training
+ * Train with multiple threads. The command line flag :code:`trainer_count` sets the number of training threads, e.g. :code:`paddle train --trainer_count=4`
+* Single-machine GPU training
+ * Train on a GPU. Set the command line flag :code:`use_gpu`, e.g. :code:`paddle train --use_gpu=true`
+ * Train on multiple GPUs. Set the command line flags :code:`use_gpu` and :code:`trainer_count`: :code:`--use_gpu=true` enables GPU training and :code:`trainer_count` sets the number of GPUs, e.g. :code:`paddle train --use_gpu=true --trainer_count=4`
+* Multi-machine training
+ * Multi-machine training is also straightforward: first start :code:`paddle pserver` on every node, then use :code:`paddle train --pservers=192.168.100.1,192.168.100.2` to specify the IP address of each pserver
+ * For details, refer to the `multi-machine training `_ documentation.
+
+
+3. Encountering "illegal instruction" errors
+----------------------------------------------
+
+To improve performance, PaddlePaddle uses AVX instructions during computation, which some older CPU models do not support. In general, run :code:`grep avx /proc/cpuinfo`; if it prints anything, AVX is supported. (Note: with this check, some virtual machines may report AVX support yet still crash at runtime; treat them as unsupported and use the solutions below.)
+
+The solutions are:
+
+* Use a NO_AVX `installation package <../build_and_install/index.html>`_ or `Docker image <../build_and_install/install/docker_install.html>`_
+* Or recompile PaddlePaddle with :code:`-DWITH_AVX=OFF`.
+
+
+4. How to choose the learning rate for SGD
+--------------------------------------------
+
+When training with sgd/async_sgd, an important issue is choosing the right learning_rate. If the learning_rate is too large, training may not converge; if it is too small, convergence may be very slow and training takes too long.
+
+The usual practice is to start from a fairly large learning_rate and, if training does not converge, divide it by 10 and try again until training converges. How do you tell that training is not converging? You can estimate the minimal cost, cost0, that a model producing a constant output could achieve.
+
+If the cost during training is clearly higher than this constant-output cost, training can be judged not to converge. For example, for a three-class problem using multi-class-cross-entropy as the cost, with class proportions :code:`0.2, 0.5, 0.3` for classes 0, 1 and 2, the minimal cost achievable by a constant output is :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is still above this number after one pass (or earlier), training can be considered non-convergent and the learning rate should be lowered.
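+
+This baseline can be checked quickly with a few lines of Python; the sketch below just reproduces the arithmetic above.
+
+.. code-block:: python
+
+    import math
+
+    p = [0.2, 0.5, 0.3]  # class proportions for classes 0, 1, 2
+    # minimal multi-class cross-entropy achievable by a constant output
+    cost0 = -sum(pi * math.log(pi) for pi in p)
+    print(round(cost0, 2))  # ~1.03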
+
+
+5. How to initialize parameters
+---------------------------------
+
+By default, PaddlePaddle initializes parameters with mean 0 and standard deviation :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization usually does not give bad results. If you want a custom initialization, PaddlePaddle currently provides two schemes:
+
+* Gaussian distribution: set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution: set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, the following code sets both the parameter initialization and the bias initialization of a fully connected layer.
+
+.. code-block:: python
+
+ hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+ bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The code above initializes all biases to 1.0 and draws the parameters from a uniform distribution over :code:`[-1.0, 1.0]`.
+
+6. How to share parameters
+----------------------------
+
+PaddlePaddle uses the parameter name :code:`name` as the parameter's ID; parameters with the same name are shared. A parameter's name can be set with :code:`ParamAttr(name="YOUR_PARAM_NAME")`. An even more convenient way is to let the layers that should share a parameter use the same :code:`ParamAttr` object.
+
+A parameter-sharing configuration example for a simple fully connected network is:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` use the same parameter and bias, and the two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
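+
+For illustration, a minimal sketch of sharing one weight matrix by name; the layer sizes and the input layer :code:`ipt` are placeholder assumptions.
+
+.. code-block:: python
+
+    shared = ParamAttr(name="shared_fc_param")
+
+    hidden_a = fc_layer(input=ipt, size=200, param_attr=shared)
+    hidden_b = fc_layer(input=ipt, size=200, param_attr=shared)
+    # hidden_a and hidden_b now read and update the same parameter matrix,
+    # because both fc_layers refer to the parameter named "shared_fc_param".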
+
+
diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc_cn/faq/reduce_min_pool_size.py
new file mode 100644
index 00000000000000..2811b134b66b1e
--- /dev/null
+++ b/doc_cn/faq/reduce_min_pool_size.py
@@ -0,0 +1,6 @@
+@provider(min_pool_size=0, ...)
+def process(settings, filename):
+ os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before.
+ with open('%s.shuf' % filename, 'r') as f:
+ for line in f:
+ yield get_sample_from_line(line)
\ No newline at end of file
diff --git a/doc_cn/faq/word2vec_config.py b/doc_cn/faq/word2vec_config.py
new file mode 100644
index 00000000000000..e347252476eab6
--- /dev/null
+++ b/doc_cn/faq/word2vec_config.py
@@ -0,0 +1,8 @@
+... # the settings and the data provider definition are omitted.
+DICT_DIM=3000 # dictionary dimension.
+word_ids=data_layer('word_ids', size=DICT_DIM)
+
+emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
+emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
+predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
+outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM)))
\ No newline at end of file
diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc_cn/faq/word2vec_dataprovider.py
new file mode 100644
index 00000000000000..a0a39080cece90
--- /dev/null
+++ b/doc_cn/faq/word2vec_dataprovider.py
@@ -0,0 +1,8 @@
+DICT_DIM=3000
+@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
+def process(settings, filename):
+ with open(filename) as f:
+  # yield the context word ids together with the inner word id to predict,
+  # such as [28, 29, 10, 4], 4
+  # It means the sentence is 28, 29, 4, 10, 4.
+ yield read_next_from_file(f)
\ No newline at end of file
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
index 6cf5588b5b34f5..d2d50fbdb47f27 100644
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -3,6 +3,7 @@ PaddlePaddle文档
 User Guide
 ----------
+
 * `Quick Start `_
 * `Build and Install `_
 * `User Interface `_
@@ -16,4 +17,13 @@ PaddlePaddle文档
 Algorithm Tutorials
 -------------------
-* `RNN Configuration <../doc/algorithm/rnn/rnn.html>`_
+
+* `Recurrent Group Tutorial `_
+* `Single-layer RNN Example <../doc/algorithm/rnn/rnn.html>`_
+* `Hierarchical (Two-level) RNN Example `_
+* `Layers that Support Two-level Sequences as Input `_
+
+FAQ
+--------
+
+* `FAQ `_
diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py
index 4eab5b1fd3b50a..bf13b56372b56a 100644
--- a/doc_cn/ui/data_provider/mnist_provider.dict.py
+++ b/doc_cn/ui/data_provider/mnist_provider.dict.py
@@ -2,10 +2,10 @@
# Define a py data provider
-@provider(input_types=[
- dense_vector(28 * 28),
- integer_value(10)
-])
+@provider(input_types={
+ 'pixel': dense_vector(28 * 28),
+ 'label': integer_value(10)
+})
def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file
@@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently.
pixels_float.append(float(each_pixel_str))
# give data to paddle.
- yield { "pixel": pixels_float, 'label': int(label) }
+ yield {"pixel": pixels_float, 'label': int(label)}
f.close() # close file
diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst
index 9e1d8c531f5ba2..80b40084d8f503 100644
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ b/doc_cn/ui/data_provider/pydataprovider2.rst
@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
 counts as one batch size, but sometimes, for better load balancing, a single sample can be counted as multiple batch sizes
 * cache is the data caching strategy, see `cache`_
 * init_hook is a function called at initialization time, see `init_hook`_
-* use_dynamic_order if true, a dict may be returned whose keys are data_layer names and whose values are the features; a
-   list or tuple may also be returned. If false, only a list or tuple may be returned
 * check if set to true, the data is validated against input_types.
 * check_fail_continue if set to true, samples that fail the check are discarded and training continues. If
  check is false, this option has no effect.
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
index f7019b27f8f02a..657fdf65e92c9d 100755
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
esac
done
shift $(($OPTIND - 1))
- export PYTHONPATH=$PYPATH
+ export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
old mode 100644
new mode 100755
index e03a9a1baa0041..cdb730bb3cec7a
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -2,10 +2,17 @@ set(AVX_SOURCES
src/hl_math.cc
src/hl_avx_functions.cc
)
-set(CUDA_SOURCES
- src/hl_time.cc
- src/hl_cpu_functions.cc
- ${AVX_SOURCES})
+
+if(WITH_AVX)
+ set(CUDA_SOURCES
+ src/hl_time.cc
+ src/hl_cpu_functions.cc
+ ${AVX_SOURCES})
+else()
+ set(CUDA_SOURCES
+ src/hl_time.cc
+ src/hl_cpu_functions.cc)
+endif()
set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cuda_cublas.cc
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 77e2649b172144..1fe2774cc5a291 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -185,7 +185,7 @@ typedef struct {
size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
/**
* HPPL data type: real (float or double)
*
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index aa4720f6ca749f..b5240da0f398c8 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -169,7 +169,7 @@ extern void hl_avgpool_forward(
* @brief Maximum pool backward.
*
* @param[in] frameCnt batch size of input image.
- * @param[in] outGrad input data.
+ * @param[in] outGrad output grad data.
* @param[in] channels number of channel.
* @param[in] height image height.
* @param[in] width image width.
@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad,
const size_t outputW,
const size_t numChannels);
+/**
+ * @brief MaxOut forward.
+ *
+ * @param[in] inData input data.
+ * @param[out] outData output data.
+ * @param[out] idData output maxId.
+ * @param[in] batchSize batchSize.
+ * @param[in] size number of channels * image height * image width.
+ * @param[in] featLen feature length = image height * image width.
+ * @param[in] groups number of groups.
+ */
+extern void hl_maxout_forward(
+ const real* inData, real* outData, int* idData,
+ size_t batchSize, size_t size, size_t featLen, size_t groups);
+
+/**
+ * @brief MaxOut backward.
+ *
+ * @param[out] inGrad input grad data.
+ * @param[in] outGrad output grad data.
+ * @param[in] idData output maxId.
+ * @param[in] batchSize batchSize.
+ * @param[in] size number of channels * image height * image width.
+ * @param[in] featLen feature length = image height * image width.
+ * @param[in] groups number of groups.
+ */
+extern void hl_maxout_backward(
+ real* inGrad, const real* outGrad, const int* idData,
+ size_t batchSize, size_t size, size_t featLen, size_t groups);
+
#endif /* HL_CNN_H_ */
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh
index cba1c9f30da8d5..d39cf67448b4f2 100644
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/math/MathFunctions.h"
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm
#else
#define CBLAS_GEMM paddle::gemm
diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/cuda/include/hl_gpu_functions.cuh
index 38df4eb8958f21..a2c5ebd18a4403 100644
--- a/paddle/cuda/include/hl_gpu_functions.cuh
+++ b/paddle/cuda/include/hl_gpu_functions.cuh
@@ -28,7 +28,7 @@ namespace hppl {
const real min = SIGMOID_THRESHOLD_MIN;
const real max = SIGMOID_THRESHOLD_MAX;
real tmp = (a < min) ? min : ((a > max) ? max : a);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
return __fdividef(1.0f, 1.0f + __expf(-tmp));
#else
return 1.0 / (1.0 + exp(-tmp));
@@ -36,7 +36,7 @@ namespace hppl {
}
__device__ static real tanh(const real a) {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
#else
return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh
index 473d394c0c688d..a3645ef51e6ef7 100644
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -30,7 +30,7 @@ limitations under the License. */
#define INLINE inline
#endif
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
#define DEVICE_FMAX fmaxf
#define DEVICE_FMIN fminf
#else
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh
index 6917f362901411..51e483d1fb2ff3 100644
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -21,7 +21,7 @@ limitations under the License. */
#ifdef __CUDA_ARCH__
// typedef void* vecType;
#include
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
#else
typedef double2 vecType;
@@ -30,7 +30,7 @@ typedef double2 vecType;
#include
#include
#include
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
typedef __m128 vecType;
#else
typedef __m128d vecType;
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 828c21beb2fbd4..46d86b2982f065 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
*/
extern void hl_sequence2batch_copy(real *batch,
real *sequence,
- int *batchIndex,
+ const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
index c90d49e4adeb5e..45db2f313e0d6e 100644
--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
#define VECTOR_SIZE 16
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast(ptr));
}
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
template
inline real hl_agg_op(Agg agg, vecType mm) {
__m128 lo = _mm_unpacklo_ps(mm, mm);
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index aa9442fb80237e..cf79fad9004cd8 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad,
const size_t outputW,
const size_t numChannels) {}
+inline void hl_maxout_forward(
+ const real* inData, real* outData, int* idData,
+ size_t batchSize, size_t size, size_t featLen, size_t group) {}
+
+inline void hl_maxout_backward(
+ real* inGrad, const real* outGrad, const int* idData,
+ size_t batchSize, size_t size, size_t featLen, size_t group) {}
+
#endif // HL_CNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 417f40e0a69f6c..aabd956c37f7dc 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
inline void hl_sequence2batch_copy(real *batch,
real *sequence,
- int *batchIndex,
+ const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index f965adc13575c1..499b61195af5e1 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad,
threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad,
outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW);
CHECK_SYNC("hl_bilinear_backward failed");
-}
\ No newline at end of file
+}
+
+__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
+ real * outData, int* idData,
+ size_t size, size_t featLen, size_t groups) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if(index < nthreads) {
+ size_t batch_idx = index / size;
+ size_t i = index % size;
+ size_t channel_idx = i / featLen;
+ size_t feat_idx = i % featLen;
+ size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
+ real max = inData[data_idx];
+ int maxId = 0;
+ for (size_t g = 1; g < groups; ++g) {
+ real tmp = inData[data_idx + g * featLen];
+ if (tmp > max) {
+ max = tmp;
+ maxId = g;
+ }
+ }
+ outData[index] = max;
+ idData[index] = maxId;
+ }
+}
+
+void hl_maxout_forward(const real* inData, real* outData,
+ int* idData, size_t batchSize, size_t size,
+ size_t featLen, size_t groups) {
+ int num_kernels = size * batchSize;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>(
+ num_kernels, inData, outData, idData, size, featLen, groups);
+ CHECK_SYNC("hl_maxout_forward failed");
+}
+
+__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
+ const real* outGrad, const int* idData,
+ size_t size, size_t featLen, size_t groups) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if(index < nthreads) {
+ size_t batch_idx = index / size;
+ size_t i = index % size;
+ size_t channel_idx = i / featLen;
+ size_t feat_idx = i % featLen;
+ size_t newIndex = batch_idx * size;
+ size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
+ (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
+ }
+}
+
+void hl_maxout_backward(real* inGrad, const real* outGrad,
+ const int* idData, size_t batchSize, size_t size,
+ size_t featLen, size_t groups) {
+ int num_kernels = size * batchSize;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>(
+ num_kernels, inGrad, outGrad, idData, size, featLen, groups);
+ CHECK_SYNC("hl_maxout_backward failed");
+}
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index dc109487ded20f..b3c9001ba39736 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
#define CUBLAS_GEAM dynload::cublasSgeam
#define CUBLAS_GEMV dynload::cublasSgemv
#define CUBLAS_GEMM dynload::cublasSgemm
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index c2dce1977bdf5d..b215c0f6e33a18 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input,
int height,
int width)
{
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value,
int height,
int width)
{
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index f4c07367b485b8..e9fe9f1c117a05 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) {
void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 38e4f16217c2a4..067e68c41e1198 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d,
CHECK_SYNC("hl_matrix_add failed");
}
-#ifdef HPPL_TYPE_DOUBLE
+#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
@@ -102,7 +102,7 @@ void subMaxAndExp(real* I,
val = -THRESHOLD;
}
I[nextIdx] = val;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
O[nextIdx] = __expf(val);
#else
O[nextIdx] = exp(val);
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index e028880156e5b1..63824eaa4c201c 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template
__global__
void KeSequence2Batch(real *batch,
real *sequence,
- int *batchIndex,
+ const int *batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
void hl_sequence2batch_copy(real *batch,
real *sequence,
- int *batchIndex,
+ const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh
index 13e89390d68c22..c3b98f4ebc38db 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
}
/* best perf */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
#define CU_CSCMM_THREAD_M_BEST 9
#else
#define CU_CSCMM_THREAD_M_BEST 4
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index c3b4769f7612b7..8cefbb30ada46d 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu,
+DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+ bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
dataPool_ = dataPool;
@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
}
void DoubleBuffer::insertOneBatch(DataBatch* batch) {
+ while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out
+ if (stopping_) return;
+ }
BufferBatch* bufBatch = bufferQueue_->dequeue();
// clone and copy the data from an Threadlocal Variable
bufBatch->clone(batch, useGpu_);
@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() {
actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
}
insertOneBatch(&newBatch);
- } while (actualSize > 0);
+ } while (actualSize > 0 && !stopping_);
}
}
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 534491d70d5467..112e45de1cb232 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -259,7 +259,9 @@ typedef Queue BufferBatchQueue;
class DoubleBuffer {
public:
- DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
+ DoubleBuffer(DataProvider* dataPool,
+ bool useGpu,
+ int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
@@ -308,7 +310,8 @@ class DataProvider {
/**
* @brief create only used for unittest.
*/
- inline static DataProvider* create(const DataConfig &config, bool useGpu) {
+ inline static DataProvider* create(const DataConfig &config,
+ bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
@@ -348,7 +351,6 @@ class DataProvider {
*/
virtual void reset() {
if (doubleBuffer_ != nullptr) {
- LOG(INFO) << "the double-buffer is starting ...";
doubleBuffer_->startAsyncLoad();
}
}
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 2f9a1223c6e454..ca8b07af49ca07 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -14,13 +14,20 @@ limitations under the License. */
#ifndef PADDLE_NO_PYTHON
+#include
#include
#include
#include
#include
+#include
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include
#include "DataProvider.h"
+
#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Stat.h"
namespace paddle {
@@ -202,7 +209,10 @@ class PyDataProvider2 : public DataProvider {
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
- :DataProvider(config, useGpu), callingContextCreated_(2) {
+ :DataProvider(config, useGpu),
+ callingContextCreated_(2) {
+ if (PyArray_API == NULL)
+ import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
@@ -246,8 +256,7 @@ class PyDataProvider2 : public DataProvider {
PyObjectPtr && kwargs) {
LOG(INFO) << "loading dataprovider " << model <<"::" << className;
- PyObjectPtr module(PyImport_ImportModule(model.c_str()));
- CHECK_PY(module) << "Cannot imort module " << model.c_str();
+ PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
@@ -455,6 +464,7 @@ class PyDataProvider2 : public DataProvider {
std::condition_variable pushCV_;
std::condition_variable pullCV_;
std::mutex mtx_;
+
ThreadBarrier callingContextCreated_;
std::unique_ptr cache_;
@@ -497,8 +507,8 @@ class PyDataProvider2 : public DataProvider {
* Resetting the PyDataProvider. May start reading thread here.
*/
virtual void reset() {
- DataProvider::reset();
resetImpl(true);
+ DataProvider::reset();
}
/**
@@ -519,6 +529,7 @@ class PyDataProvider2 : public DataProvider {
* Loading a batch of data.
*/
int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+ REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
size_t size = (size_t) size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
@@ -699,10 +710,22 @@ class DenseScanner: public IFieldScanner {
*/
virtual void fill(Argument &argument, PyObject *obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
- py::SequenceHelper s(obj);
- // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
- for (size_t i=0; i < headerPtr_->dim; ++i) {
- dat[i] = (real) s.getDouble(i);
+ if (PyArray_Check(obj)) {
+ auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+ if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+ real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
+ auto sz = PyArray_SIZE((PyArrayObject*)obj);
+ std::copy(data, data + sz, dat);
+ } else {
+ LOG(FATAL) << "You should yield float" << sizeof(real) * 8
+ << " array";
+ }
+ } else {
+ py::SequenceHelper s(obj);
+ // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+ for (size_t i=0; i < headerPtr_->dim; ++i) {
+ dat[i] = (real) s.getDouble(i);
+ }
}
++height_;
}
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 273925ba55ee40..22579891f397af 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
public:
virtual void init(const EvaluatorConfig& config) {
- CHECK(!FLAGS_use_gpu) << "Not supported";
Evaluator::init(config);
if (config.chunk_scheme() == "IOB") {
numTagTypes_ = 2;
@@ -137,6 +136,7 @@ class ChunkEvaluator : public Evaluator {
CHECK_EQ(arguments.size(), (size_t)2);
IVectorPtr& output = arguments[0].ids;
IVectorPtr& label = arguments[1].ids;
+ CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
auto sequenceStartPositions =
arguments[1].sequenceStartPositions->getVector(false);
CHECK_EQ(output->getSize(), label->getSize());
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 787ce703a08aef..0ded30eeb44e95 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
para->getMat(PARAMETER_GRADIENT).get());
std::vector& ids = mainMat->getIds(threadId_);
- ids.clear();
for (auto slaveParams : slaveParameters) {
SparseRowCpuMatrix* mat =
dynamic_cast((*slaveParams)[pid]
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index fc38bca3c403b2..340cd1b9f8e927 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs,
const std::vector inArgs;
std::vector outArgs;
frames_[i]->forward(inArgs, &outArgs, passType);
+ if (hasSubseq) {
+ for (auto& outFrameLine : outFrameLines_) {
+ CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
+ << "In hierachical RNN, all out links should be from sequences.";
+ }
+ }
}
if (evaluator_ && passType == PASS_TEST) {
this->eval(evaluator_.get());
@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
std::vector sequenceStartPositions;
const int* subSequenceStartPositions = nullptr;
- if (hasSubseq) { // for sequenceScatterAgentLayer
- subSequenceStartPositions =
- input.subSequenceStartPositions->getData(false);
+ if (hasSubseq) { // for sequenceScatterAgentLayer
+ subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
inlinkInfo->seqStartPosIndex.clear();
inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0
}
// maxSequenceLength_: max topLevelLength in allsamples
for (int i = 0; i < maxSequenceLength_; ++i) {
if (hasSubseq) {
- sequenceStartPositions.push_back(0); // first element = 0
+ sequenceStartPositions.push_back(0); // first element = 0
}
int numSeqs = 0;
for (size_t j = 0; j < numSequences; ++j) {
@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
}
if (hasSubseq) {
// inFrameLine create sequenceStartPositions one time
- CHECK_EQ(sequenceStartPositions.size(),
- static_cast(maxSequenceLength_ +
- input.getNumSubSequences()));
+ CHECK_EQ(
+ sequenceStartPositions.size(),
+ static_cast(maxSequenceLength_ + input.getNumSubSequences()));
CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
static_cast(maxSequenceLength_ + 1));
createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths,
newPaths.end(), Path::greaterPath);
newPaths.resize(totalExpandCount + minNewPathSize);
- real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize,
- newPaths.end())->logProb;
- real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize,
- newPaths.end())->logProb;
+ real minPathLogProb =
+ std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
+ ->logProb;
+ real maxPathLogProb =
+ std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
+ ->logProb;
// Remove the already formed paths that are relatively short
finalPaths_[seqId].erase(
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 056e9568852ac9..5e07446c71ff62 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "AgentLayer.h"
#include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
// get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realNumSequences) {
- int numRows = realOutput.sequenceStartPositions->
- getData(false)[numSamples_];
+ int numRows =
+ realOutput.sequenceStartPositions->getData(false)[numSamples_];
CHECK(!realOutput.ids) << "Not supported";
output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
/* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
int width = this->getSize();
if (realOutArg_.value || realOutArg_.ids) {
- output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
- width, useGpu_);
+ output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+ useGpu_);
} else { // used in generation
if (realLayer_->getOutput().ids) {
IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
if (realOutArg_.value || realOutArg_.ids) {
CHECK(realOutArg_.sequenceStartPositions);
- output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
- width, useGpu_, /* trans */ false, /* seqFlag */ true,
+ output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
+ useGpu_, /* trans */ false, /* seqFlag */ true,
/* seqStart */ seqStartPosIndex_,
/* seqSize */ numSequences_);
} else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
CHECK_NE(input.sequenceStartPositions.get(),
output_.sequenceStartPositions.get());
ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
- numSequences + 1, false);
+ numSequences + 1, false);
int* outStarts = output_.sequenceStartPositions->getMutableData(false);
- IVector::resizeOrCreate(cpuInputStartPos_, height, false);
- int* inStarts = cpuInputStartPos_->getData();
+ ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
+ int* inStarts = inputStartPos_->getMutableData(false);
+
size_t offsetOut = 0;
for (size_t i = 0; i < numSequences; ++i) {
outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
}
outStarts[numSequences] = offsetOut;
- if (useGpu_) {
- IVector::resizeOrCreate(inputStartPos_, height, true);
- inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
- } else {
- inputStartPos_ = cpuInputStartPos_;
- }
- outputValue->copyByRowIndex(*input.value, *inputStartPos_);
+ outputValue->copyByRowIndex(*input.value,
+ *inputStartPos_->getVector(useGpu_));
}
}
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index d82078dd933294..3d7bf558340707 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
protected:
// use to store expanded cpuStartPositions or subSequenceStartPositions
// of real layer.
- IVectorPtr cpuInputStartPos_;
-
- // point to cpuInputStartPos_ when useGpu_ is false
- // copy from cpuInputStartPos_ when useGpu_ is true
- IVectorPtr inputStartPos_;
+ ICpuGpuVectorPtr inputStartPos_;
public:
explicit SequenceScatterAgentLayer(const LayerConfig& config)
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
index 374117b7659bbe..7401cdc9a516bb 100644
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "AverageLayer.h"
#include "paddle/utils/Logging.h"
@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
- /* Initialize the basic parent class */
- Layer::init(layerMap, parameterMap);
+ SequencePoolLayer::init(layerMap, parameterMap);
- /* initialize biases_ */
- if (biasParameter_.get() != NULL) {
- biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_));
- }
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy
@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
} else {
LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
}
- // transform to which sequence type
- if (config_.trans_type() == "non-seq") {
- type_ = kNonSeq;
- } else if (config_.trans_type() == "seq") {
- type_ = kSeq;
- } else {
- LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
- }
- setNeedSequenceInfo(false);
return true;
}
void AverageLayer::forward(PassType passType) {
- Layer::forward(passType);
-
- // average layer should have exactly 1 input
- CHECK_EQ(1U, inputLayers_.size());
-
- size_t dim = getSize();
- const Argument& input = getInput(0);
- int64_t newBatchSize =
- type_ ? input.getNumSubSequences() : input.getNumSequences();
- ICpuGpuVectorPtr startPositions =
- type_ ? input.subSequenceStartPositions
- : input.sequenceStartPositions;
- const int* starts = startPositions->getData(false);
- size_t numSequences = startPositions->getSize() - 1;
-
- // check
- CHECK_EQ(numSequences, (size_t)newBatchSize);
- CHECK_EQ(starts[numSequences], input.getBatchSize());
- if (type_) {
- // when trans_type = seq, input must hasSubseq
- CHECK_EQ(input.hasSubseq(), 1UL);
- }
+ SequencePoolLayer::forward(passType);
- CHECK_EQ(dim, input.value->getWidth());
-
- resetOutput(newBatchSize, dim);
- auto startsPos = startPositions->getVector(useGpu_);
MatrixPtr inputValue = getInputValue(0);
- getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
-
- /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
- * thus, in this case, output_ has no sequenceStartPositions.
- * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
- * case, we should compute the new sequenceStartPositions.
- */
- if (type_) {
- output_.degradeSequence(input, useGpu_);
- }
+ getOutputValue()->sequenceAvgForward(
+ *inputValue, *startPositions_->getVector(useGpu_), mode_);
/* add the bias-vector AFTER average operation */
if (biases_.get() != NULL) {
@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
}
void AverageLayer::backward(const UpdateCallback& callback) {
- const Argument& input = getInput(0);
- ICpuGpuVectorPtr startPositions =
- type_ ? input.subSequenceStartPositions
- : input.sequenceStartPositions;
- const int* starts = startPositions->getData(false);
- /* Do derivation */ { backwardActivation(); }
-
- if (biases_ && biases_->getWGrad()) {
- biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
- // Increasing the number of gradient
- biases_->getParameterPtr()->incUpdate(callback);
- }
+ SequencePoolLayer::backward(callback);
+ const int* starts = startPositions_->getData(false);
MatrixPtr grad = getInputGrad(0);
+
if (grad) {
size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData();
- size_t numSequences = startPositions->getSize() - 1;
+ size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index ae910ddefad137..1edc2ace492c5b 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
/**
* A layer for "internal average" for sequence input.
* Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
* output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
*/
-
-class AverageLayer : public Layer {
+class AverageLayer : public SequencePoolLayer {
public:
enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
- enum AverageLevel { kNonSeq = 0, kSeq = 1 };
- explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+ explicit AverageLayer(const LayerConfig& config)
+ : SequencePoolLayer(config) {}
~AverageLayer() {}
@@ -46,11 +46,8 @@ class AverageLayer : public Layer {
void backward(const UpdateCallback& callback = nullptr);
protected:
- std::unique_ptr biases_;
MatrixPtr outMtx_;
MatrixPtr dataMtx_;
int mode_;
- int type_;
};
-
} // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp
index bbd0b53273b430..9290ce4f6d46c1 100644
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ExpandLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
const Argument& shapeInput = getInput(1);
const Argument& dataInput = getInput(0);
size_t outputBatchSize = shapeInput.getBatchSize();
- auto startPositions =
- type_ ? shapeInput.subSequenceStartPositions
- : shapeInput.sequenceStartPositions;
+ auto startPositions = type_ ? shapeInput.subSequenceStartPositions
+ : shapeInput.sequenceStartPositions;
size_t numSequences = startPositions->getSize() - 1;
const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
// set output sequence info as shape sequence
output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
if (shapeInput.hasSubseq()) {
- output_.subSequenceStartPositions =
- shapeInput.subSequenceStartPositions;
+ output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
}
// reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
- IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
- int* expandStarts = cpuExpandStartsPos_->getData();
+ ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
+ int* expandStarts = expandStartsPos_->getMutableData(false);
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
}
}
- if (useGpu_) {
- // TODO(Dangqingqing) move copyFrom
- IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
- expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
- } else {
- expandStartsPos_ = cpuExpandStartsPos_;
- }
-
- outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
+ outputValue->copyByRowIndex(*inputValue,
+ *expandStartsPos_->getVector(useGpu_));
if (biases_.get() != NULL) {
outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
void ExpandLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
- /* Increasing the number of gradient */
+ /* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
}
if (!getInputGrad(0)) return;
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
- auto cpuSeqStartPos =
- type_ ? getInput(1).subSequenceStartPositions
- : getInput(1).sequenceStartPositions;
+ auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
+ : getInput(1).sequenceStartPositions;
size_t numSequences = cpuSeqStartPos->getSize() - 1;
const int* starts = cpuSeqStartPos->getData(false);
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 8a3eb1c973a475..fbe0ced9b1754d 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ class ExpandLayer : public Layer {
enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
/// store the ExpandLevel
int type_;
- // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
- // and expandStartsPos_
/// expanded sequenceStartPositions or subSequenceStartPositions
/// of input[1]
- IVectorPtr cpuExpandStartsPos_;
- /// point to cpuExpandStartsPos_ when useGpu_ is false,
- /// copy from cpuExpandStartsPos_ when useGpu_ is true
- IVectorPtr expandStartsPos_;
+ ICpuGpuVectorPtr expandStartsPos_;
public:
explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp
index 226e0ea87dbd4a..c4ffe894eccd61 100644
--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "MaxLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
@@ -21,55 +20,11 @@ namespace paddle {
REGISTER_LAYER(max, MaxLayer);
-bool MaxLayer::init(const LayerMap& layerMap,
- const ParameterMap& parameterMap) {
- /* Initialize the basic parent class */
- Layer::init(layerMap, parameterMap);
-
- /* initialize biases_ */
- if (biasParameter_.get() != NULL) {
- biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_));
- }
-
- // transform to which sequence type
- if (config_.trans_type() == "non-seq") {
- type_ = kNonSeq;
- } else if (config_.trans_type() == "seq") {
- type_ = kSeq;
- } else {
- LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
- }
- setNeedSequenceInfo(false);
- return true;
-}
-
void MaxLayer::forward(PassType passType) {
- Layer::forward(passType);
- // max layer should have exactly 1 input
- CHECK_EQ(1U, inputLayers_.size());
-
- size_t dim = getSize();
- const Argument& input = getInput(0);
- int64_t newBatchSize =
- type_ ? input.getNumSubSequences() : input.getNumSequences();
- ICpuGpuVectorPtr startPositions =
- type_ ? input.subSequenceStartPositions
- : input.sequenceStartPositions;
- auto starts = startPositions->getVector(useGpu_);
- size_t numSequences = startPositions->getSize() - 1;
+ SequencePoolLayer::forward(passType);
- CHECK_EQ(dim, input.value->getWidth());
- CHECK_EQ(numSequences, (size_t)newBatchSize);
- CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
- if (type_) {
- // when trans_type = seq, input must hasSubseq
- CHECK_EQ(input.hasSubseq(), 1UL);
- }
-
- // reset output: resize to "num of sequences", not "batch size".
- resetOutput(newBatchSize, dim);
-
- IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
+ IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
+ useGpu(deviceId_));
maxIndex_->zeroMem();
MatrixPtr inputValue = getInputValue(0);
@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
{
REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
- outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
- }
-
- /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
- * thus, in this case, output_ has no cpuSequenceStartPositions.
- * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
- * case, we should compute the new cpuSequenceStartPositions.
- */
- if (type_) {
- output_.degradeSequence(input, useGpu_);
+ outputValue->maxSequenceForward(
+ *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
}
if (config_.output_max_index()) {
@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
void MaxLayer::backward(const UpdateCallback& callback) {
CHECK(!config_.output_max_index())
<< "backward is not available when output_max_index is set";
- /* Do derivation */ { backwardActivation(); }
-
- if (biases_ && biases_->getWGrad()) {
- biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
- // Increasing the number of gradient
- biases_->getParameterPtr()->incUpdate(callback);
- }
+ SequencePoolLayer::backward(callback);
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
if (inputGrad) {
- ICpuGpuVectorPtr starts =
- type_ ? getInput(0).subSequenceStartPositions
- : getInput(0).sequenceStartPositions;
REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
- inputGrad->maxSequenceBackward(*outputGrad,
- *(starts->getVector(useGpu_)), *maxIndex_);
+ inputGrad->maxSequenceBackward(
+ *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
}
}
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index b4c34e665d926d..e6dcfe9c6759d1 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h"
@@ -24,29 +24,30 @@ namespace paddle {
/**
* A layer for "internal max" for sequence input.
* Input: one or more sequences. Each sequence contains some instances.
- * If MaxLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]}
- * If MaxLevel = kSeq:
+ * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences
* output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
*/
-class MaxLayer : public Layer {
+class MaxLayer : public SequencePoolLayer {
protected:
- std::unique_ptr biases_;
// maxIndex_[i][j] = k : the value at (i, j) is from input[k].
IVectorPtr maxIndex_;
- int type_;
public:
- explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
- enum MaxLevel {kNonSeq = 0, kSeq = 1 };
+ explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
~MaxLayer() {}
- bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+ return SequencePoolLayer::init(layerMap, parameterMap);
+ }
void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr);
diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp
new file mode 100644
index 00000000000000..a3de069bf7a6c9
--- /dev/null
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxOutLayer.h"
+#include "hl_gpu.h"
+#include "hl_cnn.h"
+
+namespace paddle {
+
+REGISTER_LAYER(maxout, MaxOutLayer);
+
+size_t MaxOutLayer::getSize() {
+ const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf();
+ imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
+ imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
+ if (imgSizeH_ == 0) {
+ imgSizeH_ = maxoutConf.img_size_y();
+ }
+ if (imgSizeW_ == 0) {
+ imgSizeW_ = maxoutConf.img_size_x();
+ }
+
+ featLen_ = imgSizeH_ * imgSizeW_;
+ size_t layerSize = featLen_ * outputChannels_;
+
+ getOutput().setFrameHeight(imgSizeH_);
+ getOutput().setFrameWidth(imgSizeW_);
+
+ return layerSize;
+}
+
+bool MaxOutLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ /* Initialize the basic parent class */
+ Layer::init(layerMap, parameterMap);
+
+ /* the size of inputs for maxout-layer is 1 */
+ CHECK_EQ(config_.inputs_size(), 1);
+
+ const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
+ groups_ = conf.groups();
+ channels_ = conf.channels();
+ CHECK_EQ(channels_ % groups_, 0UL);
+ outputChannels_ = channels_ / groups_;
+
+ return true;
+}
+
+void MaxOutLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ /* malloc memory for the output_ if necessary */
+ /* note: one sample correspond to one column */
+ size_t batchSize = getInput(0).getBatchSize();
+ size_t size = getSize();
+ resetOutput(batchSize, size);
+ MatrixPtr inputV = getInputValue(0);
+ MatrixPtr outV = getOutputValue();
+
+ IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_);
+ outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_);
+}
+
+void MaxOutLayer::backward(const UpdateCallback& callback) {
+ (void)callback;
+
+ /* Do derivation */
+ MatrixPtr inputG = getInputGrad(0);
+ MatrixPtr outG = getOutputGrad();
+
+ if (inputG) {
+ inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
new file mode 100644
index 00000000000000..9011a5c332b17a
--- /dev/null
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer to do max out on conv layer output.
+ * Input: output of a conv layer.
+ * Output: feature maps of the same size as the input, with
+ * (input channels) / groups channels each. So the number of input
+ * channels must be divisible by groups.
+ *
+ * The config file api is maxout_layer.
+ */
+
+class MaxOutLayer : public Layer {
+protected:
+ size_t groups_;
+ size_t imgSizeH_, imgSizeW_;
+ /// outputChannels_ = channels_ / groups_
+ size_t channels_, outputChannels_;
+ /// feature length = imgSizeH_ * imgSizeW_
+ size_t featLen_;
+ IVectorPtr maxoutId_;
+
+public:
+ /// return imgSizeH_ * imgSizeW_ * outputChannels_;
+ size_t getSize();
+
+ explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
+ virtual ~MaxOutLayer() {}
+
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+ void forward(PassType passType);
+ void backward(const UpdateCallback& callback = nullptr);
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index a896e16a6027b3..4faebe5d2ad6f9 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -21,14 +21,18 @@ limitations under the License. */
namespace paddle {
/**
- * Noise-contrastive estimation
+ * Noise-contrastive estimation.
* Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language models
+ * A fast and simple algorithm for training neural probabilistic language models.
+ *
+ * The config file api is nce_layer.
*/
class NCELayer : public Layer {
int numClasses_;
- int numInputs_; // number of input layer besides labelLayer and weightLayer
+ /// number of input layers besides labelLayer and weightLayer
+ int numInputs_;
LayerPtr labelLayer_;
+ /// weight layer, can be None
LayerPtr weightLayer_;
WeightList weights_;
std::unique_ptr biases_;
@@ -43,7 +47,8 @@ class NCELayer : public Layer {
real weight;
};
std::vector samples_;
- bool prepared_; // whether samples_ is prepared
+ /// whether samples_ is prepared
+ bool prepared_;
Argument sampleOut_;
IVectorPtr labelIds_;
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 12831e36688029..26d9536dd57aa3 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
-#include "Layer.h"
+#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Stat.h"
@@ -29,20 +29,19 @@ namespace paddle {
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence
- * of the input sequence
+ * of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
*/
-class SequenceLastInstanceLayer : public Layer {
+class SequenceLastInstanceLayer : public SequencePoolLayer {
protected:
- std::unique_ptr biases_;
MatrixPtr tmpSrc_;
MatrixPtr tmpDest_;
- enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
- int type_;
public:
explicit SequenceLastInstanceLayer(const LayerConfig& config)
- : Layer(config) {}
+ : SequencePoolLayer(config) {}
~SequenceLastInstanceLayer() {}
@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
- /* Initialize the basic parent class */
- Layer::init(layerMap, parameterMap);
-
- // seqlastins layer should have exactly 1 input
- CHECK_EQ(1U, inputLayers_.size());
-
- /* initialize biases_ */
- if (biasParameter_.get() != NULL) {
- biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_));
- }
+ SequencePoolLayer::init(layerMap, parameterMap);
tmpSrc_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
tmpDest_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
- // transform to which sequence type
- if (config_.trans_type() == "non-seq") {
- type_ = kNonSeq;
- } else if (config_.trans_type() == "seq") {
- type_ = kSeq;
- } else {
- LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
- }
- setNeedSequenceInfo(false);
return true;
}
void SequenceLastInstanceLayer::forward(PassType passType) {
- Layer::forward(passType);
-
- size_t dim = getSize();
- const Argument& input = getInput(0);
-
- // check
- auto startPositions =
- type_ ? input.subSequenceStartPositions->getVector(false)
- : input.sequenceStartPositions->getVector(false);
- size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
- CHECK_EQ(dim, input.value->getWidth());
- CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
- CHECK_EQ(height, startPositions->getSize() - 1);
- if (type_) {
- // when trans_type = seq, input must hasSubseq
- CHECK_EQ(input.hasSubseq(), 1UL);
- }
+ SequencePoolLayer::forward(passType);
- reserveOutput(height, dim);
- const int* starts = startPositions->getData();
+ const int* starts = startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
- for (size_t seqId = 0; seqId < height; ++seqId) {
+ for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId =
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
outputValue->subMatrix(seqId, 1, tmpDest_)
->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
}
- /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
- * thus, in this case, output_ has no sequenceStartPositions.
- * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
- * case, we should compute the new sequenceStartPositions.
- */
- if (type_) {
- output_.degradeSequence(input, useGpu_);
- }
}
if (biases_.get() != NULL) {
@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
}
void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
- /* activation, should set to 'linear' in most cases */
- backwardActivation();
-
- if (biases_ && biases_->getWGrad()) {
- biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
- // Increasing the number of gradient
- biases_->getParameterPtr()->incUpdate(callback);
- }
+ SequencePoolLayer::backward(callback);
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
- auto startPositions =
- type_ ? getInput(0).subSequenceStartPositions->getVector(false)
- : getInput(0).sequenceStartPositions->getVector(false);
- const int* starts = startPositions->getData();
- size_t numSequences = startPositions->getSize() - 1;
+ const int* starts = startPositions_->getData(false);
+ size_t numSequences = startPositions_->getSize() - 1;
if (inputGrad) {
AsyncGpuBlock asyncGpuBlock;
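
As a concrete illustration of the `starts` indexing used in the forward and backward passes above (made-up numbers, not Paddle code): with start positions `[0, 2, 5]` there are two sequences covering rows `[0, 2)` and `[2, 5)`; when `select_first` is set the layer copies row `starts[i]`, otherwise it copies row `starts[i + 1] - 1`.

```python
import numpy as np

starts = [0, 2, 5]                       # start positions for 2 sequences, batch size 5
values = np.arange(5 * 3).reshape(5, 3)  # one row per instance, dim 3

last = np.stack([values[starts[i + 1] - 1] for i in range(len(starts) - 1)])
first = np.stack([values[starts[i]] for i in range(len(starts) - 1)])
print(last)   # rows 1 and 4 of values
print(first)  # rows 0 and 2 of values
```
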
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
new file mode 100644
index 00000000000000..55be73d363df19
--- /dev/null
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Logging.h"
+#include "SequencePoolLayer.h"
+
+namespace paddle {
+
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ /* Initialize the basic parent class */
+ Layer::init(layerMap, parameterMap);
+
+ // seqlastins/max/average layer should have exactly 1 input
+ CHECK_EQ(1U, inputLayers_.size());
+
+ /* initialize biases_ */
+ if (biasParameter_.get() != NULL) {
+ biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+ }
+ // transform to which sequence type
+ if (config_.trans_type() == "non-seq") {
+ type_ = kNonSeq;
+ } else if (config_.trans_type() == "seq") {
+ type_ = kSeq;
+ } else {
+ LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+ }
+ setNeedSequenceInfo(false);
+ return true;
+}
+
+void SequencePoolLayer::forward(PassType passType) {
+ Layer::forward(passType);
+
+ const Argument& input = getInput(0);
+ newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+ size_t dim = getSize();
+ // check
+ CHECK_EQ(dim, input.value->getWidth());
+ startPositions_ =
+ type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+ auto starts = startPositions_->getVector(false);
+ CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+ CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+
+ resetOutput(newBatchSize_, dim);
+ if (type_) {
+ CHECK(input.subSequenceStartPositions)
+ << "when trans_type = seq, input must hasSubseq";
+ }
+ /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+ * thus, in this case, output_ has no sequenceStartPositions.
+ * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+ * case, we should compute the new sequenceStartPositions.
+ */
+ if (type_) {
+ output_.degradeSequence(input, useGpu_);
+ }
+}
+
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+ /* Do derivation */ { backwardActivation(); }
+
+ if (biases_ && biases_->getWGrad()) {
+ biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+ // Increase the number of gradient updates
+ biases_->getParameterPtr()->incUpdate(callback);
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
new file mode 100644
index 00000000000000..669af80e1d447a
--- /dev/null
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ * Output: output size is the number of input sequences (NOT input instances)
+ * output[i] = seqlastin/average/max_{for each instance in this
+ * sequence}{input[i]}
+ * If SequenceLevel = kSeq:
+ *   The input sequence must have sub-sequences
+ * Output: output size is the number of input sub-sequences
+ * output[i] = seqlastin/average/max_{for each instance in this
+ * sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+
+class SequencePoolLayer : public Layer {
+protected:
+ int type_;
+ std::unique_ptr<Weight> biases_;
+ enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+ size_t newBatchSize_;
+ ICpuGpuVectorPtr startPositions_;
+
+public:
+ explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+
+ virtual ~SequencePoolLayer() {}
+
+ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+ void forward(PassType passType);
+ void backward(const UpdateCallback& callback = nullptr);
+};
+
+} // namespace paddle
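
A minimal sketch of the pooling semantics documented above for the kNonSeq case (illustrative NumPy only, not Paddle code): each output row is one reduction over the input rows of a single sequence, so the output height equals the number of sequences rather than the number of instances.

```python
import numpy as np

def sequence_pool(values, starts, op):
    # values: (num_instances, dim); starts: start positions, len = num_sequences + 1
    outs = []
    for i in range(len(starts) - 1):
        seq = values[starts[i]:starts[i + 1]]
        if op == "last":
            outs.append(seq[-1])
        elif op == "average":
            outs.append(seq.mean(axis=0))
        elif op == "max":
            outs.append(seq.max(axis=0))
    return np.stack(outs)

values = np.arange(10, dtype=float).reshape(5, 2)
starts = [0, 2, 5]
print(sequence_pool(values, starts, "average"))   # one row per sequence
```
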
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
index 5c3b062309c51f..321c78cb1741bc 100644
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -14,12 +14,15 @@
from paddle.trainer.PyDataProvider2 import *
+# Note that each config should have an independent provider
+# in the current design of PyDataProvider2.
+#######################################################
data = [
[[[1, 3, 2], [4, 5, 2]], 0],
[[[0, 2], [2, 5], [0, 1, 2]], 1],
]
-
+# Used for sequence_nest_rnn.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)],
should_shuffle=False)
@@ -27,7 +30,7 @@ def process_subseq(settings, file_name):
for d in data:
yield d
-
+# Used for sequence_rnn.conf
@provider(input_types=[integer_value_sequence(10),
integer_value(3)],
should_shuffle=False)
@@ -38,11 +41,32 @@ def process_seq(settings, file_name):
seq += subseq
yield seq, d[1]
+# Used for sequence_nest_rnn_multi_input.conf
+@provider(input_types=[integer_value_sub_sequence(10),
+ integer_value(3)],
+ should_shuffle=False)
+def process_subseq2(settings, file_name):
+ for d in data:
+ yield d
+
+# Used for sequence_rnn_multi_input.conf
+@provider(input_types=[integer_value_sequence(10),
+ integer_value(3)],
+ should_shuffle=False)
+def process_seq2(settings, file_name):
+ for d in data:
+ seq = []
+ for subseq in d[0]:
+ seq += subseq
+ yield seq, d[1]
+
+###########################################################
data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
]
+# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value_sub_sequence(10),
integer_value(2)],
@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name):
yield d
+# Used for sequence_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sequence(10),
integer_value_sequence(10),
integer_value(2)],
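
For clarity, a small standalone illustration of the two data shapes these providers yield (plain Python, no Paddle imports): the sub-sequence providers yield nested lists for `integer_value_sub_sequence`, while the flat providers concatenate the sub-sequences into one list for `integer_value_sequence`.

```python
# one nested sample: a sequence of sub-sequences, plus an integer label
nested = [[[1, 3, 2], [4, 5, 2]], 0]

# the flat providers concatenate the sub-sequences into one plain sequence
flat_words = []
for subseq in nested[0]:
    flat_words += subseq

print(nested[0])                 # [[1, 3, 2], [4, 5, 2]]
print((flat_words, nested[1]))   # ([1, 3, 2, 4, 5, 2], 0)
```
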
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index cbed1f15fc4157..b166e778d7a33f 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -21,7 +21,7 @@
def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)),
- integer_value_sequence(3)]
+ integer_value(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -34,14 +34,14 @@ def process(settings, file_name):
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
- yield word_slot, [label]
+ yield word_slot, label
## for hierarchical sequence network
def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
- integer_value_sub_sequence(3)]
+ integer_value_sequence(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -57,7 +57,7 @@ def process2(settings, file_name):
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
- label_list.append([label])
+ label_list.append(label)
word_slot_list.append(word_slot)
else:
yield word_slot_list, label_list
diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf
index 62b8c5d072d7b4..93b08eb2f8746d 100644
--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -56,9 +56,8 @@ def outer_step(x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine
- # does not handle it correctly. Current implementation requires that
- # all the out links are from sequences. However, it does not report error
- # when the out links are not sequences.
+ # does not handle it, and will report error: In hierachical RNN, all out
+ # links should be from sequences now.
return inner_rnn_output
out = recurrent_group(
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
index e01b3f8e7aa5c4..0614958b4719dd 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None,
module='rnn_data_provider',
- obj='process_subseq')
+ obj='process_subseq2')
settings(batch_size=2, learning_rate=0.01)
@@ -57,9 +57,8 @@ def outer_step(wid, x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine
- # does not handle it correctly. Current implementation requires that
- # all the out links are from sequences. However, it does not report error
- # when the out links are not sequences.
+ # does not handle it, and will report error: In hierachical RNN, all out
+ # links should be from sequences now.
return inner_rnn_output
out = recurrent_group(
diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf
index 968621cab59be9..51881e21d971bb 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None,
module='rnn_data_provider',
- obj='process_seq')
+ obj='process_seq2')
settings(batch_size=2, learning_rate=0.01)
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 425d669206cce3..db48cc47a4a638 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) {
}
}
+TEST(Layer, maxoutLayer) {
+ TestConfig config;
+ config.biasSize = 0;
+ config.layerConfig.set_type("maxout");
+
+ config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+ LayerInputConfig* input = config.layerConfig.add_inputs();
+ MaxOutConfig* maxout = input->mutable_maxout_conf();
+
+ maxout->set_img_size_x(32);
+ maxout->set_img_size_y(32);
+ maxout->set_channels(4);
+ maxout->set_groups(2);
+
+ for (auto useGpu : {false, true}) {
+ testLayerGrad(config, "maxout", 10, false, useGpu);
+ }
+}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
config.biasSize = 4096;
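
The sizes wired into the maxout test above follow from the maxout definition; a quick sanity check of that arithmetic, assuming the layer width is laid out as `img_size_x * img_size_y * channels`:

```python
img_size_x, img_size_y, channels, groups = 32, 32, 4, 2

input_width = img_size_x * img_size_y * channels              # width of "layer_0"
output_width = img_size_x * img_size_y * (channels // groups)

assert input_width == 4096    # matches the INPUT_DATA size used in the test
print(output_width)           # 2048: same spatial size, channels / groups maps
```
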
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index e75e53ab7f431a..6bf1e329251219 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) {
}
TEST(PyDataProvider2, init_hook) {
- paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle"));
+ paddle::PyObjectPtr pickle = paddle::py::import("pickle");
paddle::PyObjectPtr globals(
PyModule_GetDict(PyImport_AddModule("__main__")));
PyDict_SetItemString(globals.get(), "pickle", pickle.get());
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 145fe85cff7d88..71c3335231e521 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename):
yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
-@provider(input_types=[index_slot(10), index_slot(10)])
+@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
def test_input_order(setting, filename):
for _ in xrange(1000):
yield {
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index ae7f617371ca5f..d104db3e5b32d5 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -24,7 +23,7 @@ limitations under the License. */
P_DECLARE_int32(seed);
using namespace paddle; // NOLINT
-using namespace std; // NOLINT
+using namespace std; // NOLINT
class TrainerForTest : public paddle::Trainer {
public:
void startTrain() {
@@ -44,11 +43,10 @@ class TrainerForTest : public paddle::Trainer {
*/
size_t getTotalParameterSize() const {
auto p = const_cast(this);
- auto & params = p->getGradientMachine()->getParameters();
- return std::accumulate(params.begin(), params.end(), 0UL,
- [](size_t a, const ParameterPtr& p){
- return a+p->getSize();
- });
+ auto& params = p->getGradientMachine()->getParameters();
+ return std::accumulate(
+ params.begin(), params.end(), 0UL,
+ [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
}
};
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 469255719701a0..602d7db035deb5 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
copyFrom(matrix);
}
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight();
size_t width = getWidth();
CHECK_EQ(b.getWidth(), width);
real* dst = getData();
real* src = b.getData();
- int* index = rowIndex.getData();
+ const int* index = rowIndex.getData();
hl_sequence2batch_copy(dst, src, index, width, height, true);
}
@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) {
max.maxCols(*this);
}
+void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+ LOG(FATAL) << "Is not supported";
+}
+
+void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ CHECK(dynamic_cast<GpuMatrix*>(&a));
+ CHECK(dynamic_cast<GpuIVector*>(&id));
+ CHECK_EQ(a.getHeight(), getHeight());
+
+ size_t size = getWidth();
+ size_t batchSize = getHeight();
+ const real* input = a.getData();
+ real* output = getData();
+ int* idForGpu = id.getData();
+
+ hl_maxout_forward(input, output, idForGpu, batchSize, size,
+ size / channels, groups);
+}
+
+void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ CHECK(dynamic_cast<GpuMatrix*>(&a));
+ CHECK(dynamic_cast<GpuIVector*>(&id));
+ CHECK_EQ(a.getHeight(), getHeight());
+
+ size_t size = a.getWidth();
+ size_t batchSize = getHeight();
+ real* input = getData();
+ const real* output = a.getData();
+ const int* idForGpu = id.getData();
+
+ hl_maxout_backward(input, output, idForGpu, batchSize, size,
+ size / channels, groups);
+}
+
/*calulate the error of classification */
void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
GpuMatrixPtr output_ptr = std::dynamic_pointer_cast(output);
@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
}
}
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight();
size_t width = getWidth();
CHECK_EQ(b.getWidth(), width);
- int* index = rowIndex.getData();
+ const int* index = rowIndex.getData();
for (size_t i = 0; i < height; i++) {
CHECK_LT(static_cast(index[i]), b.getHeight());
real* src = b.getData() + index[i] * width;
@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) {
max.maxCols(*this);
}
+void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+ CHECK(isContiguous());
+ CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
+ size_t numSamples = getWidth();
+ size_t beam = maxVal.getHeight();
+ CHECK_EQ(maxIds.getSize(), numSamples * beam);
+ CHECK_EQ(maxVal.getWidth(), numSamples);
+
+ real* a = getData();
+ int* s = maxIds.getData();
+ real* t = maxVal.getData();
+ size_t dim = getHeight();
+ for (size_t i = 0; i < numSamples; i++) {
+ std::vector<std::pair<real, size_t>> vec;
+ for (size_t j = 0; j < dim; j++) {
+ vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
+ }
+
+ std::partial_sort(
+ vec.begin(), vec.begin() + beam, vec.end(),
+ [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
+ return l.first > r.first;
+ });
+ for (size_t j = 0; j < beam; j++) {
+ t[i + j * numSamples] = vec[j].first;
+ s[i + j * numSamples] = vec[j].second;
+ }
+ }
+}
+
+void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ CHECK(dynamic_cast<CpuMatrix*>(&a));
+ CHECK(dynamic_cast<CpuIVector*>(&id));
+ CHECK_EQ(a.getHeight(), getHeight());
+
+ size_t size = getWidth();
+ size_t batchSize = getHeight();
+ size_t featLen = size / channels;
+ const real* input = a.getData();
+ int* idForCpu = id.getData();
+
+ MatrixPtr maxInMat, maxOutMat;
+ Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
+ Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
+
+ for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+ size_t newIndex = batch_idx * size;
+ IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
+
+ for (size_t i = 0; i < channels; ++i) {
+ size_t newFeatLen = i * featLen;
+ for (size_t j = 0; j < groups; ++j) {
+ maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
+ ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
+ featLen);
+ }
+ }
+ maxInMat->colMax(*tmpId, *maxOutMat);
+ this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
+ }
+}
+
+void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ CHECK(dynamic_cast<CpuMatrix*>(&a));
+ CHECK(dynamic_cast<CpuIVector*>(&id));
+ CHECK_EQ(a.getHeight(), getHeight());
+
+ size_t size = a.getWidth();
+ size_t batchSize = getHeight();
+ size_t featLen = size / channels;
+ size_t newFeatLen = groups * featLen;
+ real* inputG = getData();
+ const real* outG = a.getData();
+ int* idForCpu = id.getData();
+
+ for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+ size_t newIndex = batch_idx * size;
+ int* idData = idForCpu + newIndex;
+
+ for (size_t i = 0; i < size; ++i) {
+ int gradIdx =
+ idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
+ (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
+ }
+ }
+}
+
void CpuMatrix::rowNormalizeL1(Matrix& out) {
CHECK(!out.useGpu());
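
A NumPy sketch of the gradient routing that `maxoutBackward` performs (illustrative only; the layout matches the forward sketch given earlier and the names are made up): every output gradient is added to the input element that won the max in the forward pass, that is, input channel `out_channel * groups + id` at the same spatial offset.

```python
import numpy as np

def maxout_backward(out_grad, ids, out_channels, groups):
    # out_grad, ids: (batch, out_channels * feat_len)
    # returns the input gradient, shape (batch, out_channels * groups * feat_len)
    batch, out_width = out_grad.shape
    feat_len = out_width // out_channels
    in_grad = np.zeros((batch, out_width * groups))
    for b in range(batch):
        for i in range(out_width):
            out_ch, pos = divmod(i, feat_len)
            in_ch = out_ch * groups + ids[b, i]     # channel that won the max
            in_grad[b, in_ch * feat_len + pos] += out_grad[b, i]
    return in_grad

out_grad = np.ones((1, 2 * 3))                      # out_channels=2, feat_len=3
ids = np.array([[0, 1, 0, 1, 1, 0]])
print(maxout_backward(out_grad, ids, out_channels=2, groups=2))
```
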
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index b4922d7e6f5469..9b16ceacbfe98a 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ class Matrix : public BaseMatrix {
LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
}
- virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+ virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
LOG(FATAL) << "Not implemented";
}
@@ -493,16 +493,40 @@ class Matrix : public BaseMatrix {
LOG(FATAL) << "Not implemeted";
}
+ /**
+ * Store the maximum of each column of this matrix into max.
+ */
virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
+ /**
+ * @brief Get the top k elements of each column of this matrix.
+ *
+ * The row ids and values of these elements are stored in
+ * maxIds and maxVal respectively, where k is determined by the size
+ * of maxIds. Note that the top k elements are not sorted.
+ */
+ virtual void colMax(IVector& maxIds, Matrix& maxVal) {
+ LOG(FATAL) << "not implemented";
+ }
+
+ virtual void maxoutForward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ LOG(FATAL) << "not implemented";
+ }
+
+ virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels,
+ size_t groups) {
+ LOG(FATAL) << "not implemented";
+ }
+
virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
/**
* @brief Get the top k elements of each row of this matrix.
*
* The column ids and values of these elements are stored in
- * maxIds and max respectively. Note that the top k
- * elements are not sorted.
+   *             maxIds and max respectively, where k is determined by the
+   *             size of maxIds. Note that the top k elements are not sorted.
*/
virtual void rowMax(IVector& maxIds, Matrix& max) {
LOG(FATAL) << "Not implemented";
@@ -995,7 +1019,7 @@ class GpuMatrix : public Matrix {
void copyFrom(const IVector& src);
- void copyByRowIndex(Matrix& b, IVector& rowIndex);
+ void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1101,6 +1125,9 @@ class GpuMatrix : public Matrix {
void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& max);
void colMax(Matrix& max);
+ void colMax(IVector& maxIds, Matrix& max);
+ void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+ void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void oneHotCrossEntropy(Matrix& output, IVector& label);
void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
@@ -1271,7 +1298,7 @@ class CpuMatrix : public Matrix {
void copyFrom(CpuSparseMatrix& src);
- void copyByRowIndex(Matrix& b, IVector& rowIndex);
+ void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1425,6 +1452,9 @@ class CpuMatrix : public Matrix {
void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& maxVal);
void colMax(Matrix& max);
+ void colMax(IVector& maxIds, Matrix& maxVal);
+ void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+ void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void rowNormalizeL1(Matrix& out);
void oneHotCrossEntropy(Matrix& output, IVector& label);
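
For intuition, a small NumPy sketch of the per-column top-k contract that the new `colMax(IVector&, Matrix&)` overload documents (a sketch only; like the documented contract, it makes no promise about the order of the k results):

```python
import numpy as np

def col_topk(a, k):
    # a: (dim, num_samples); returns (ids, vals), each (k, num_samples):
    # row indices and values of the k largest entries of every column, unordered
    ids = np.argpartition(a, -k, axis=0)[-k:]
    vals = np.take_along_axis(a, ids, axis=0)
    return ids, vals

a = np.random.RandomState(0).rand(6, 4)     # dim=6, num_samples=4
ids, vals = col_topk(a, k=2)
print(ids)
print(vals)
```
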
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index 0b5de252258a96..6986624d25c7a4 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
std::vector& localIndices = indexDictHandle_->localIndices;
+ for (size_t i = 0; i < len; i++) {
+ CHECK_LT(*(ids + i), this->getHeight())
+ << "id:" << *(ids + i) << " Height:" << this->getHeight()
+ << " sparse id value exceeds the max input dimension, "
+ << "which could be caused by invalid input data samples";
+ }
localIndices.insert(localIndices.end(), ids, ids + len);
}
void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
CpuSparseMatrix* mat = dynamic_cast(input.get());
- CHECK(mat) << "only support non value sparse matrix";
+ CHECK(mat) << "only support sparse matrix";
addRows(reinterpret_cast(mat->getCols()),
mat->getElementCnt());
}
@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
int* index = ids->getData();
for (size_t i = 0; i < numSamples; ++i) {
if (index[i] == -1) continue;
- localIndices.push_back((unsigned int)index[i]);
+
+ unsigned int id = (unsigned int)index[i];
+ CHECK_LT(id, this->getHeight())
+ << "id:" << id << " Height:" << this->getHeight()
+ << " sparse id value exceeds the max input dimension, "
+ << "which could be caused by invalid input data samples";
+ localIndices.push_back(id);
}
}
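
The new checks above fire when a sparse feature id is not smaller than the parameter height, i.e. the configured input dimension. A hypothetical provider-side pre-check that catches the same condition before training starts (plain Python, names made up for illustration):

```python
def validate_sparse_ids(samples, input_dim):
    # samples: iterable of lists of integer feature ids
    for n, ids in enumerate(samples):
        bad = [i for i in ids if not (0 <= i < input_dim)]
        if bad:
            raise ValueError("sample %d has ids %s outside [0, %d)"
                             % (n, bad, input_dim))

validate_sparse_ids([[0, 5, 9], [3, 7]], input_dim=10)   # passes
# validate_sparse_ids([[0, 12]], input_dim=10)           # would raise ValueError
```
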
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 2ff19e7b3f87ca..2cc38b82306e2b 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) {
}
}
+void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+ int channels, int groups) {
+ int inWidth = imgSizeH * imgSizeW * channels;
+ int outChannels = channels / groups;
+ int outWidth = imgSizeH * imgSizeW * outChannels;
+
+ // forward
+ MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+ MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+ MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+ MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+ MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+
+ IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
+ IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
+ IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
+
+ input->randomizeUniform();
+ inputGpu->copyFrom(*input);
+
+ target->maxoutForward(*input, *id, outChannels, groups);
+ targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
+
+ // check
+ targetCheck->copyFrom(*targetGpu);
+ MatrixCheckErr(*target, *targetCheck);
+ idCheck->copyFrom(*idGpu);
+ VectorCheckEqual(*id, *idCheck);
+
+ // backward
+ MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+ MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+ MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+ MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
+ true);
+ MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false,
+ false);
+
+ inputGrad->randomizeUniform();
+ targetGrad->randomizeUniform();
+ inputGpuGrad->copyFrom(*inputGrad);
+ targetGpuGrad->copyFrom(*targetGrad);
+
+ inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
+ inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
+
+ // check
+ targetCheckGrad->copyFrom(*inputGpuGrad);
+ MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+
+TEST(Matrix, MaxOutFwdBwd) {
+ for (auto numSamples : {5, 10}) {
+ for (auto channels : {8, 16}) {
+ for (auto imgSizeH : {14, 28}) {
+ for (auto imgSizeW : {16, 30}) {
+ for (auto groups : {2, 4}) {
+ VLOG(3) << " numSamples=" << numSamples
+ << " channels=" << channels
+ << " imgSizeH=" << imgSizeH
+ << " imgSizeW=" << imgSizeW
+ << " groups=" << groups;
+ testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
+ }
+ }
+ }
+ }
+ }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 2f9606dc680265..ff251fe89f9f88 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -146,6 +146,12 @@ class Parameter {
}
}
+ void enableBufType(ParameterType type) {
+ if (bufs_[type]) return;
+ bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+ bufs_[type]->zeroMem();
+ }
+
void enableIntType(ParameterType type, size_t intStoreSize = 0) {
if (!intBufs_[type]) {
SetDevice device(deviceId_);
diff --git a/paddle/pserver/PserverForPython.h b/paddle/pserver/PserverForPython.h
deleted file mode 100644
index 5bbeae8bd8b973..00000000000000
--- a/paddle/pserver/PserverForPython.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/pserver/ParameterClient.h"
-#include "paddle/pserver/ParameterServer.h"
-#include "paddle/parameter/Parameter.h"
-#include
-
-namespace paddle {
-
-struct PyObjectDeleter {
- void operator()(PyObject* obj) {
- if (obj) {
- Py_DECREF(obj);
- }
- }
-};
-
-class ParameterClientPy : public ParameterClient {
-protected:
- typedef std::unique_ptr PyObjectPtr;
-
- std::vector parameter_;
- int initArgc_;
- char** initArgv_;
-
-public:
- ParameterClientPy(std::vector configs, int argc,
- std::vector argv, bool useGpu) {
- initArgc_ = argc;
- initArgv_ = new char* [argc];
- for (int i = 0; i < argc; i++) {
- initArgv_[i] = new char[argv[i].size()];
- strcpy(initArgv_[i], // NOLINT
- argv[i].c_str()); // NOLINT TODO(yuyang18): use snprintf instead.
- }
- ParameterConfig pyConfig;
- ParameterPtr param;
- for (auto& config : configs) {
- pyConfig.ParseFromString(config);
- param.reset(new Parameter(pyConfig, useGpu));
- parameter_.push_back(param);
- }
- Py_Initialize();
- CHECK(Py_IsInitialized());
- }
-
- ~ParameterClientPy() {
- delete initArgv_;
- Py_Finalize();
- }
-
- Parameter getParameter(int idx) { return *(parameter_[idx].get()); }
-
- void initClientPy() {
- initMain(initArgc_, initArgv_);
- CHECK(init(parameter_)) << "Init Client Failed.";
- }
-
- void setConfigPy(std::string config) {
- OptimizationConfig optConfig;
- optConfig.ParseFromString(config);
- setConfig(optConfig);
- }
-
- bool inStatusPy(int status) { return inStatus(PServerStatus(status)); }
-
- void setStatusPy(int status) { setStatus(PServerStatus(status)); }
-
- void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); }
-
- void sendParameterPy(int updateMode, int parameterType, int numSamples,
- real cost, bool sendBackParameter) {
- sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType),
- int64_t(numSamples), real(cost), sendBackParameter);
- }
-
- template
- std::string asyncCallPy(const char* serviceName, const char* funcName,
- const std::string in) {
- ProtoIn protoIn;
- ProtoOut protoOut;
- std::mutex waitLock;
- std::string data;
- protoIn.ParseFromString(in);
- waitLock.lock();
- auto callback = [&](ProtoOut* pOut, bool isSuccessful) {
- if (isSuccessful) {
- pOut->SerializeToString(&data);
- } else {
- LOG(INFO) << "Async Talk Failed.";
- }
- waitLock.unlock();
- };
-
- ubClient_.asyncCall(serviceName, funcName, protoIn,
- &protoOut, callback);
- waitLock.lock();
- protoOut.SerializeToString(&data);
- return data;
- }
-};
-
-} // namespace paddle
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 0366bb636c704a..6d8f5da3e298fa 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -63,7 +63,8 @@ def __init__(self, input_type, pos):
def scan(self, dat):
self.extend_cols(dat)
- self.__rows__.append(len(dat))
+ self.__rows__.append(len(dat) + self.__rows__[-1])
+ self.__height__ += 1
def extend_cols(self, dat):
self.__cols__.extend(dat)
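
The `scan()` change above turns `__rows__` into cumulative offsets (CSR-style index pointers) instead of per-row lengths, and tracks the height separately. A minimal standalone sketch of that bookkeeping, assuming the row list starts as `[0]` and the height as `0` (an assumption about the surrounding class, which is not shown in this hunk):

```python
rows = [0]     # cumulative offsets: rows[i] is where row i starts in cols
cols = []
height = 0

for dat in ([1, 3, 2], [4, 5], [7]):
    cols.extend(dat)
    rows.append(len(dat) + rows[-1])
    height += 1

print(rows)    # [0, 3, 5, 6]
print(cols)    # [1, 3, 2, 4, 5, 7]
print(height)  # 3
```
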
diff --git a/paddle/scripts/travis/before_install.sh b/paddle/scripts/travis/before_install.linux.sh
similarity index 100%
rename from paddle/scripts/travis/before_install.sh
rename to paddle/scripts/travis/before_install.linux.sh
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
new file mode 100755
index 00000000000000..f438e69b822aa4
--- /dev/null
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+brew update
+brew tap homebrew/science
+brew install python
+sudo pip install --upgrade protobuf==2.6.0
+brew install homebrew/versions/protobuf260 --without-python
+brew install cmake python glog gflags openblas wget md5sha1sum
+
+wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
+tar xf gtest.tar.gz
+cd googletest-release-1.8.0/
+cmake .
+make install
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 3ea633be327027..a73c32344c8abe 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,7 +1,22 @@
#!/bin/bash
source ./common.sh
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON
-make -j `nproc`
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`"
+CMAKE_EXTRA=""
+if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+ CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+fi
+
+
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA}
+
+NPROC=1
+if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+ NPROC=`nproc`
+elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+ NPROC=`sysctl -n hw.ncpu`
+fi
+
+
+make -j $NPROC
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
sudo make install
sudo paddle version
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index 91f7f4d29df938..d0fda1b6253e3e 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -20,6 +20,8 @@ limitations under the License. */
#include "paddle/math/SparseRowMatrix.h"
#include "paddle/utils/Thread.h"
+P_DECLARE_int32(trainer_count);
+
namespace paddle {
SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
@@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector& parameters) {
false /*inPserver*/));
size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
optimizers_[pid]->init(numRows, ¶->getConfig());
+ if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
+ // For trainer_count=1, the gradient machine is NeuralNetwork, which does
+ // not create parameter buf for PARAMETER_GRADIENT for sparse update in
+ // Parameter::enableType(). But gradient parameter buf is still used
+ // in SgdThreadUpdater. We need to explicitly create it.
+ para->enableBufType(PARAMETER_GRADIENT);
+ }
}
}
@@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse(
// From MultiGradientMachine
SparseRowIdsCpuMatrix* mainMat = dynamic_cast(
para->getMat(PARAMETER_GRADIENT).get());
- const std::vector& sparseIds = mainMat->getIds(tid);
+ std::vector& sparseIds = mainMat->getIds(tid);
for (auto id : sparseIds) {
// setup sub bufs
@@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse(
optimizer->update(vecs, para->getConfig(), id);
vecs[PARAMETER_GRADIENT]->zeroMem();
}
+ sparseIds.clear();
} else if (dynamic_cast(
para->getMat(PARAMETER_GRADIENT).get())) {
// From NeuralNetwork
@@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse(
optimizer->update(vecs, para->getConfig(), id);
vecs[PARAMETER_GRADIENT]->zeroMem();
}
+ // For numThreads > 1, MultiGradientMachine is used, which goes
+ // to the above branch.
+ CHECK_EQ(numThreads, 1UL);
+ mainMat->clearIndices();
} else {
auto & m = *para->getMat(PARAMETER_GRADIENT).get();
LOG(FATAL) << "Internal error: " << para->getName() << " "
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
index 5d2e2ba9df5c71..664e18cb986811 100644
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -13,157 +13,71 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-default_initial_std(0.5)
-
-model_type("nn")
-
-DataLayer(
- name = "input",
- size = 3,
-)
-
-DataLayer(
- name = "weight",
- size = 1,
-)
-
-Layer(
- name = "layer1_1",
- type = "fc",
- size = 5,
- active_type = "sigmoid",
- inputs = "input",
-)
-
-Layer(
- name = "layer1_2",
- type = "fc",
- size = 12,
- active_type = "linear",
- inputs = Input("input", parameter_name='sharew'),
-)
-
-Layer(
- name = "layer1_3",
- type = "fc",
- size = 3,
- active_type = "tanh",
- inputs = "input",
-)
-
-Layer(
- name = "layer1_5",
- type = "fc",
- size = 3,
- active_type = "tanh",
- inputs = Input("input",
- learning_rate=0.01,
- momentum=0.9,
- decay_rate=0.05,
- initial_mean=0.0,
- initial_std=0.01,
- format = "csc",
- nnz = 4)
-)
-
-FCLayer(
- name = "layer1_4",
- size = 5,
- active_type = "square",
- inputs = "input",
- drop_rate = 0.5,
-)
-
-Layer(
- name = "pool",
- type = "pool",
- inputs = Input("layer1_2",
- pool = Pool(pool_type="cudnn-avg-pool",
- channels = 1,
- size_x = 2,
- size_y = 3,
- img_width = 3,
- padding = 1,
- padding_y = 2,
- stride = 2,
- stride_y = 3))
-)
-
-Layer(
- name = "concat",
- type = "concat",
- inputs = ["layer1_3", "layer1_4"],
-)
-
-MixedLayer(
- name = "output",
- size = 3,
- active_type = "softmax",
- inputs = [
- FullMatrixProjection("layer1_1",
- learning_rate=0.1),
- TransposedFullMatrixProjection("layer1_2", parameter_name='sharew'),
- FullMatrixProjection("concat"),
- IdentityProjection("layer1_3"),
- ],
-)
-
-Layer(
- name = "label",
- type = "data",
- size = 1,
-)
-
-Layer(
- name = "cost",
- type = "multi-class-cross-entropy",
- inputs = ["output", "label", "weight"],
-)
-
-Layer(
- name = "cost2",
- type = "nce",
- num_classes = 3,
- active_type = "sigmoid",
- neg_sampling_dist = [0.1, 0.3, 0.6],
- inputs = ["layer1_2", "label", "weight"],
-)
-
-Evaluator(
- name = "error",
- type = "classification_error",
- inputs = ["output", "label", "weight"]
-)
-
-Inputs("input", "label", "weight")
-Outputs("cost", "cost2")
-
-TrainData(
- ProtoData(
- files = "dummy_list",
- constant_slots = [1.0],
- async_load_data = True,
- )
-)
-
-TestData(
- SimpleData(
- files = "trainer/tests/sample_filelist.txt",
- feat_dim = 3,
- context_len = 0,
- buffer_capacity = 1000000,
- async_load_data = False,
- ),
-)
-
-Settings(
- algorithm = "sgd",
- num_batches_per_send_parameter = 1,
- num_batches_per_get_parameter = 1,
- batch_size = 100,
- learning_rate = 0.001,
- learning_rate_decay_a = 1e-5,
- learning_rate_decay_b = 0.5,
-)
+from paddle.trainer_config_helpers import *
+
+TrainData(ProtoData(
+ files = "dummy_list",
+ constant_slots = [1.0],
+ async_load_data = True))
+
+TestData(SimpleData(
+ files = "trainer/tests/sample_filelist.txt",
+ feat_dim = 3,
+ context_len = 0,
+ buffer_capacity = 1000000,
+ async_load_data = False))
+
+settings(batch_size = 100)
+
+data = data_layer(name='input', size=3)
+
+wt = data_layer(name='weight', size=1)
+
+fc1 = fc_layer(input=data, size=5,
+ bias_attr=True,
+ act=SigmoidActivation())
+
+fc2 = fc_layer(input=data, size=12,
+ bias_attr=True,
+ param_attr=ParamAttr(name='sharew'),
+ act=LinearActivation())
+
+fc3 = fc_layer(input=data, size=3,
+ bias_attr=True,
+ act=TanhActivation())
+
+fc4 = fc_layer(input=data, size=5,
+ bias_attr=True,
+ layer_attr=ExtraAttr(drop_rate=0.5),
+ act=SquareActivation())
+
+pool = img_pool_layer(input=fc2,
+ pool_size=2,
+ pool_size_y=3,
+ num_channels=1,
+ padding=1,
+ padding_y=2,
+ stride=2,
+ stride_y=3,
+ img_width=3,
+ pool_type=CudnnAvgPooling())
+
+concat = concat_layer(input=[fc3, fc4])
+
+with mixed_layer(size=3, act=SoftmaxActivation()) as output:
+ output += full_matrix_projection(input=fc1)
+ output += trans_full_matrix_projection(input=fc2,
+ param_attr=ParamAttr(name='sharew'))
+ output += full_matrix_projection(input=concat)
+ output += identity_projection(input=fc3)
+
+lbl = data_layer(name='label', size=1)
+
+cost = classification_cost(input=output, label=lbl, weight=wt,
+ layer_attr=ExtraAttr(device=-1))
+
+nce = nce_layer(input=fc2, label=lbl, weight=wt,
+ num_classes=3,
+ neg_distribution=[0.1, 0.3, 0.6])
+
+outputs(cost, nce)
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
new file mode 100644
index 00000000000000..f2cfd7409412de
--- /dev/null
+++ b/paddle/utils/.gitignore
@@ -0,0 +1 @@
+enable_virtualenv.c
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 0557b01e36f078..45240b5002aa18 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -2,6 +2,9 @@
file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp)
+create_resources(enable_virtualenv.py enable_virtualenv.c)
+set(UTIL_RES enable_virtualenv.c)
+
if(APPLE)
file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
else()
@@ -9,7 +12,8 @@ else()
endif()
add_library(paddle_utils STATIC
${UTIL_SOURCES}
- ${UTIL_ARCH_SOURCES})
+ ${UTIL_ARCH_SOURCES}
+ ${UTIL_RES})
add_style_check_target(paddle_utils ${UTIL_HEADERS})
add_style_check_target(paddle_utils ${UTIL_SOURCES}
${UTIL_ARCH_SOURCES})
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index b3f439804686fa..7fdfa3240c1de7 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int));
}
#endif // PADDLE_USE_GLOG
-#ifdef NDEBUG
+#ifndef NDEBUG
#define DEBUG_LEVEL 5
#define DBG VLOG(DEBUG_LEVEL)
#else
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp
index 78c3a80674f9c1..90e5093f96ea4e 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
@@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex;
PyGuard::PyGuard() : guard_(g_pyMutex) {}
-static void printPyErrorStack(std::ostream& os, bool withEndl = false) {
+static void printPyErrorStack(std::ostream& os, bool withEndl = false,
+ bool withPyPath = true) {
PyObject * ptype, *pvalue, *ptraceback;
PyErr_Fetch(&ptype, &pvalue, &ptraceback);
PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
PyErr_Clear();
+ if (withPyPath) {
+ os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path")));
+ if (withEndl) {
+ os << std::endl;
+ }
+ }
PyTracebackObject* obj = (PyTracebackObject*)ptraceback;
os << "Python Error: " << PyString_AsString(PyObject_Str(ptype))
@@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
const std::string& funcName,
const std::vector& args) {
PyGuard guard;
- PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str()));
- CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName;
- PyObjectPtr pyModule(PyImport_Import(pyModuleName.get()));
- CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed.";
+ PyObjectPtr pyModule = py::import(moduleName);
PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str()));
CHECK_PY(pyFunc) << "GetAttrString failed.";
PyObjectPtr pyArgs(PyTuple_New(args.size()));
@@ -143,7 +147,7 @@ PyObjectPtr createPythonClass(
const std::vector& args,
const std::map& kwargs) {
PyGuard guard;
- PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str()));
+ PyObjectPtr pyModule = py::import(moduleName);
LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str();
CHECK_PY(pyModule) << "Import module " << moduleName << " failed.";
PyObjectPtr pyDict(PyModule_GetDict(pyModule.get()));
@@ -181,18 +185,29 @@ std::string getPyCallStack() {
printPyErrorStack(os, true);
return os.str();
}
+
+PyObjectPtr import(const std::string &moduleName) {
+ auto module = PyImport_ImportModule(moduleName.c_str());
+ CHECK_PY(module) << "Import " << moduleName << " Error";
+ return PyObjectPtr(module);
+}
+
} // namespace py
#endif
-
+extern "C" {
+extern const char enable_virtualenv_py[];
+}
void initPython(int argc, char** argv) {
#ifndef PADDLE_NO_PYTHON
Py_SetProgramName(argv[0]);
Py_Initialize();
PySys_SetArgv(argc, argv);
-
// python blocks SIGINT. Need to enable it.
signal(SIGINT, SIG_DFL);
+
+ // Manually activate virtualenv when user is using virtualenv
+ PyRun_SimpleString(enable_virtualenv_py);
#endif
}
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index db02d1252b4057..00fc177022ac34 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
namespace py {
+PyObjectPtr import(const std::string& moduleName);
+
/**
* Cast a PyLong or PyInt to int type T.
* @tparam T return type.
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
index d73f27d7fafd6c..f952cf58778dee 100644
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -135,6 +135,21 @@ class Queue {
queueCV_.wait(lock, [this]() { return numElements_ == 0; });
}
+ /**
+ * @brief wait until the queue is not empty, for at most the given number of seconds.
+ * @param seconds wait time limit.
+ * @return true if the queue became non-empty, false if the wait timed out.
+ */
+ bool waitNotEmptyFor(int seconds) {
+ std::unique_lock<std::mutex> lock(queueLock_);
+ return queueCV_.wait_for(
+ lock,
+ std::chrono::seconds(seconds),
+ [this] {
+ return numElements_ != 0;
+ });
+ }
+
private:
std::deque elements_;
int numElements_;
diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py
new file mode 100644
index 00000000000000..99d822a4145cca
--- /dev/null
+++ b/paddle/utils/enable_virtualenv.py
@@ -0,0 +1,10 @@
+import os
+
+def __activate_virtual_env__():
+ __path__ = os.getenv('VIRTUAL_ENV')
+ if __path__ is None:
+ return
+ __script__ = os.path.join(__path__, 'bin', 'activate_this.py')
+ execfile(__script__, {'__file__': __script__})
+
+__activate_virtual_env__()
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index 8bdcd70a417b84..753fd0cac42233 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -170,6 +170,15 @@ message BlockExpandConfig {
required uint32 img_size_y = 11;
}
+message MaxOutConfig {
+ required uint32 channels = 1;
+ required uint32 groups = 2;
+
+ // The size of input feature map.
+ required uint32 img_size_x = 3;
+ required uint32 img_size_y = 4;
+}
+
message ProjectionConfig {
required string type = 1;
required string name = 2;
@@ -235,6 +244,7 @@ message LayerInputConfig {
// Set the argument name.
optional string input_layer_argument = 9;
optional BilinearInterpConfig bilinear_interp_conf = 10;
+ optional MaxOutConfig maxout_conf = 11;
}
message LayerConfig {
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 34f5dd41b7e683..53409b746d811a 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
calc_batch_size=None,
cache=CacheType.NO_CACHE,
check=False, check_fail_continue=False,
- use_dynamic_order=True,
init_hook=None, **kwargs):
"""
Provider decorator. Use it to make a function into PyDataProvider2 object.
@@ -228,9 +227,15 @@ def process(settings, file_name):
The configuration of data provider should be setup by\:
:param input_types: Specify the input types, can also be set in init_hook.
- It is a list of InputType object. For example, input_types= \
- [dense_vector(9), integer_value(2)].
- :type input_types: list|tuple
+                        It could be a list of InputType objects. For example,
+                        input_types=[dense_vector(9), integer_value(2)]. Or the
+                        user can set a dict of InputType objects, whose keys are
+                        the data_layer names. For example, input_types=\
+                        {'img': img_features, 'label': label}. When using a dict
+                        of InputType, the user can also yield a dict of feature
+                        values, whose keys are again the data_layer names.
+
+ :type input_types: list|tuple|dict
:param should_shuffle: True if data should shuffle. Pass None means shuffle
when is training and not to shuffle when is testing.
@@ -281,12 +286,6 @@ def process(settings, file_name):
drop the wrong format data when it is True. Has
no effect when check set to False.
:type check_fail_continue: bool
-
- :param use_dynamic_order: Allow provider to yield a dictionary object, whose
- key is a input data layer name, and value is the
- feature value. The tuples are still allowed when
- use_dynmaic_order is True.
- :type use_dynamic_order: bool
"""
def __wrapper__(generator):
@@ -340,6 +339,11 @@ def __init__(self, file_list, **kwargs):
assert self.slots is not None
assert self.generator is not None
+ use_dynamic_order = False
+ if isinstance(self.slots, dict): # reorder input_types
+ self.slots = [self.slots[ipt] for ipt in self.input_order]
+ use_dynamic_order = True
+
if len(self.slots) == 1:
self.generator = SingleSlotWrapper(self.generator)
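
Putting the dict form of `input_types` together with the reordering added above, a hedged usage sketch; the decorator and input type helpers are the ones already used in this tree, while the layer names `'word'` and `'label'` are made up and must match the `data_layer` names in the model config:

```python
from paddle.trainer.PyDataProvider2 import provider, integer_value_sequence, \
    integer_value

@provider(input_types={'word': integer_value_sequence(10),
                       'label': integer_value(3)},
          should_shuffle=False)
def process(settings, file_name):
    # keys match data_layer names; the wrapper above reorders the values
    # into the network's input order before feeding them to the trainer
    yield {'word': [0, 2, 1, 5], 'label': 1}
```
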
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 82446e980d81cc..c6cd4f62b91c9a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -216,6 +216,10 @@ def Inputs(*args):
if g_current_submodel is g_root_submodel:
g_config.model_config.input_layer_names.append(name)
+@config_func
+def HasInputsSet():
+ return len(g_config.model_config.input_layer_names) != 0
+
# Define the name of the output layers of the NeuralNetwork.
# Usually the output is simply the cost layer.
@@ -466,6 +470,7 @@ def __init__(
pool=None,
image=None,
block_expand=None,
+ maxout=None,
format=None,
nnz=None,
is_static=None,
@@ -794,6 +799,16 @@ def __init__(
output_y = 0):
self.add_keys(locals())
+@config_class
+class MaxOut(Cfg):
+ def __init__(
+ self,
+ channels,
+ groups,
+ img_size_x = 0,
+ img_size_y = 0):
+ self.add_keys(locals())
+
def DataBase(async_load_data=False,
constant_slots=None,
data_ratio=1,
@@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \
- block_expand.block_y) / float(block_expand.stride_y)))
+def parse_maxout(maxout, input_layer_name, maxout_conf):
+ maxout_conf.channels = maxout.channels
+ maxout_conf.groups = maxout.groups
+ maxout_conf.img_size_x = maxout.img_size_x
+ maxout_conf.img_size_y = maxout.img_size_y
+
# Define an evaluator
@config_func
def Evaluator(
@@ -1721,6 +1742,21 @@ def __init__(
self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y
* block_expand_conf.channels)
+@config_layer('maxout')
+class MaxOutLayer(LayerBase):
+ def __init__(
+ self,
+ name,
+ inputs,
+ **xargs):
+ super(MaxOutLayer, self).__init__(name, 'maxout', 0, inputs=inputs, **xargs)
+ input_layer = self.get_input_layer(0)
+ parse_maxout(self.inputs[0].maxout,
+ input_layer.name,
+ self.config.inputs[0].maxout_conf)
+ maxout_conf = self.config.inputs[0].maxout_conf
+ self.set_layer_size(g_layer_map[input_layer.name].size / maxout_conf.groups)
+
# key: cost type
# value: cost class
g_cost_map = {}
@@ -1735,7 +1771,6 @@ def init(cls, name, inputs, device=None, coeff=1.):
g_cost_map[cost_type] = cls
define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
-define_cost('ClassificationErrorLayer', 'classification_error')
define_cost('RankingCost', 'rank-cost')
define_cost('AucValidation', 'auc-validation')
define_cost('PnpairValidation', 'pnpair-validation')
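MaxOutLayer sets its output size to the input layer's size divided by groups: the spatial dimensions stay the same and only the channel count shrinks. A small sketch of that arithmetic, with illustrative numbers only:

    # Illustrative only: how the maxout output size is derived.
    channels, height, width, groups = 16, 48, 48, 2
    input_size = channels * height * width   # size of the conv output
    output_size = input_size / groups        # what MaxOutLayer sets
    assert output_size == (channels / groups) * height * width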
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index 8ada3903dc06be..f51140656d0dcf 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module,
file_list_name = 'train.list'
if isinstance(cls, TestData):
file_list_name = 'test.list'
- with open(file_list_name, 'r') as f:
+ with open(file_list_name, 'w') as f:
f.writelines(file_list)
file_list = file_list_name
@@ -84,6 +84,7 @@ def py_data2(files, load_data_module, load_data_object, load_data_args,
data.load_data_module = load_data_module
data.load_data_object = load_data_object
data.load_data_args = load_data_args
+ data.async_load_data = True
return data
data_cls = py_data2
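For context, this code path is reached through define_py_data_sources2. A rough sketch, assuming the list form of train_list is passed through unchanged and using placeholder module/object names; when a Python list is given, the helper now writes it into train.list/test.list itself, which is why the file must be opened for writing:

    # Placeholder module / object names. When train_list is a Python list,
    # the helper writes it verbatim into 'train.list' before training starts,
    # hence the open(..., 'w') fix above; entries carry their own newlines.
    define_py_data_sources2(train_list=['data/train-%d\n' % i for i in range(4)],
                            test_list='test.list',
                            module='my_dataprovider',
                            obj='process')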
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 59df4646faae98..8d249b140e8cde 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -50,11 +50,12 @@
'slope_intercept_layer', 'trans_full_matrix_projection',
'linear_comb_layer',
'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer',
+ 'nce_layer',
'cross_entropy_with_selfnorm', 'cross_entropy',
'multi_binary_label_cross_entropy',
'rank_cost', 'lambda_cost', 'huber_cost',
# 'block_expand_layer', # TODO(yuyang18): this layer is not correct
- 'out_prod_layer', 'print_layer'
+ 'maxout_layer', 'out_prod_layer', 'print_layer'
]
@@ -110,12 +111,14 @@ class LayerType(object):
SLOPE_INTERCEPT_LAYER = "slope_intercept"
LINEAR_COMBINATION_LAYER = "convex_comb"
BLOCK_EXPAND = "blockexpand"
+ MAXOUT = "maxout"
PRINT_LAYER = "print"
CTC_LAYER = "ctc"
CRF_LAYER = "crf"
CRF_DECODING_LAYER = "crf_decoding"
+ NCE_LAYER = 'nce'
RANK_COST = "rank-cost"
LAMBDA_COST = "lambda_cost"
@@ -169,7 +172,7 @@ class LayerOutput(object):
:param activation: Layer Activation.
:type activation: BaseActivation.
:param parents: Layer's parents.
- :type parents: list|tuple|collection.Sequence
+ :type parents: list|tuple|collections.Sequence
"""
def __init__(self, name, layer_type, parents=None, activation=None,
@@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters,
@layer_support()
def img_pool_layer(input, pool_size, name=None,
num_channels=None, pool_type=None,
- stride=1, start=None, padding=0, layer_attr=None,
+ stride=1, padding=0, layer_attr=None,
pool_size_y=None, stride_y=None, padding_y=None,
img_width=None):
"""
@@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None,
:type stride: int
:param stride_y: stride height of pooling. It is equal to stride by default.
:type stride_y: int|None
- :param start: start position of pooling operation. Note it is deprecated now.
- :type start: int|None
:param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute
:param img_width: the width of input feature map. If it is None, the input feature
@@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None,
pool_type=type_name,
channels=num_channels,
size_x=pool_size,
- start=start,
+ start=None,
stride=stride,
padding=padding,
size_y=pool_size_y,
@@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None):
Concat all input vector into one huge vector.
Inputs can be list of LayerOutput or list of projection.
+ The example usage is:
+
+ .. code-block:: python
+
+ concat = concat_layer(input=[layer1, layer2])
+
:param name: Layer name.
:type name: basestring
:param input: input layers or projections
- :type input: list|tuple|collection.Sequence
+ :type input: list|tuple|collections.Sequence
:param act: Activation type.
:type act: BaseActivation
:param layer_attr: Extra Layer Attribute.
@@ -2842,30 +2849,52 @@ def __real_step__(*args):
return tmp
+def __cost_input__(input, label, weight=None):
+ """
+ Build the inputs and parents lists for cost layers.
+ """
+ ipts = [Input(input.name), Input(label.name)]
+ parents = [input, label]
+ if weight is not None:
+ assert weight.layer_type == LayerType.DATA
+ ipts.append(Input(weight.name))
+ parents.append(weight)
+ return ipts, parents
+
@wrap_name_default()
-def regression_cost(input, label, cost='square_error', name=None):
+@layer_support()
+def regression_cost(input, label, weight=None, name=None,
+ layer_attr=None):
"""
Regression Layer.
TODO(yuyang18): Complete this method.
:param name: layer name.
+ :type name: basestring
:param input: Network prediction.
+ :type input: LayerOutput
:param label: Data label.
- :param cost: Cost method.
+ :type label: LayerOutput
+ :param weight: The weight layer that scales the cost of each sample.
+ It is an optional argument.
+ :type weight: LayerOutput
+ :param layer_attr: layer's extra attribute.
+ :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
+ :rtype: LayerOutput
"""
- Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name)
- return LayerOutput(
- name, LayerType.COST, parents=[input, label]
- )
+ ipts, parents = __cost_input__(input, label, weight)
+
+ Layer(inputs=ipts, type="square_error", name=name,
+ **ExtraLayerAttribute.to_kwargs(layer_attr))
+ return LayerOutput(name, LayerType.COST, parents=parents)
@wrap_name_default("cost")
@layer_support()
-def classification_cost(input, label, name=None,
- cost="multi-class-cross-entropy",
+def classification_cost(input, label, weight=None, name=None,
evaluator=classification_error_evaluator,
layer_attr=None):
"""
@@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None,
:type input: LayerOutput
:param label: label layer name. data_layer often.
:type label: LayerOutput
- :param cost: cost method.
- :type cost: basestring
+ :param weight: The weight layer that scales the cost of each sample.
+ It is an optional argument.
+ :type weight: LayerOutput
:param evaluator: Evaluator method.
:param layer_attr: layer's extra attribute.
:type layer_attr: ExtraLayerAttribute
@@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None,
assert input.layer_type != LayerType.DATA
assert isinstance(input.activation, SoftmaxActivation)
assert label.layer_type == LayerType.DATA
- Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)],
+
+ ipts, parents = __cost_input__(input, label, weight)
+
+ Layer(name=name, type="multi-class-cross-entropy", inputs=ipts,
**ExtraLayerAttribute.to_kwargs(layer_attr))
def __add_evaluator__(e):
@@ -2900,7 +2933,7 @@ def __add_evaluator__(e):
assert isinstance(e.for_classification, bool)
assert e.for_classification
- e(name=e.__name__, input=input, label=label)
+ e(name=e.__name__, input=input, label=label, weight=weight)
if not isinstance(evaluator, collections.Sequence):
evaluator = [evaluator]
@@ -2908,7 +2941,7 @@ def __add_evaluator__(e):
for each_evaluator in evaluator:
__add_evaluator__(each_evaluator)
- return LayerOutput(name, LayerType.COST, parents=[input, label])
+ return LayerOutput(name, LayerType.COST, parents=parents)
def conv_operator(img, filter, filter_size, num_filters,
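With the weight argument threaded through __cost_input__, both cost helpers accept an optional per-sample weight coming from a size-1 data layer. A short sketch that mirrors the test_cost_layers_with_weight config added later in this patch (layer names are illustrative):

    # Per-sample weights come from a size-1 DATA layer; each value scales the
    # cost of the corresponding sample.
    data = data_layer(name='input', size=300)
    lbl = data_layer(name='label', size=1)
    wt = data_layer(name='weight', size=1)
    fc = fc_layer(input=data, size=10, act=SoftmaxActivation())

    outputs(classification_cost(input=fc, label=lbl, weight=wt),
            regression_cost(input=fc, label=lbl, weight=wt))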
@@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters,
@wrap_name_default()
-def conv_shift_layer(a, b, name=None):
+@layer_support()
+def conv_shift_layer(a, b, name=None, layer_attr=None):
"""
This layer performs cyclic convolution for two input. For example:
- a[in]: contains M elements.
@@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None):
:type a: LayerOutput
:param b: input layer b
:type b: LayerOutput
+ :param layer_attr: layer's extra attribute.
+ :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None):
name=name,
type=LayerType.CONV_SHIFT_LAYER,
inputs=[a.name, b.name],
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b],
@@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None,
@wrap_param_attr_default()
@wrap_bias_attr_default()
@wrap_act_default()
+@layer_support()
def selective_fc_layer(input, select, size, act=None, name=None,
pass_generation=False,
has_selected_colums=True,
@@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None,
@wrap_name_default()
-def sampling_id_layer(input, name=None):
+@layer_support()
+def sampling_id_layer(input, name=None, layer_attr=None):
"""
A layer for sampling id from multinomial distribution from the input layer.
Sampling one id for one sample.
@@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None):
:type input: LayerOutput
:param name: The Layer Name.
:type name: basestring
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None):
name=name,
type=LayerType.SAMPLING_ID_LAYER,
inputs=[Input(input.name)],
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input)
@wrap_name_default()
-def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
+@layer_support()
+def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0,
+ layer_attr=None):
"""
This layer for applying a slope and an intercept to the input
element-wise. There is no activation and weight.
@@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
:type slope: float.
:param intercept: the offset.
:type intercept: float.
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
slope=slope,
intercept=intercept,
inputs=[Input(input.name)],
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input)
@wrap_name_default()
-def linear_comb_layer(weights, vectors, size=None, name=None):
+@layer_support()
+def linear_comb_layer(weights, vectors, size=None, name=None,
+ layer_attr=None):
"""
A layer for weighted sum of vectors takes two inputs.
- Input: size of weights is M
@@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
:type size: int
:param name: The Layer Name.
:type name: basestring
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
type=LayerType.LINEAR_COMBINATION_LAYER,
size=size,
inputs=[Input(weights.name), Input(vectors.name)],
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
[weights, vectors], size=size)
@@ -3295,6 +3347,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
@wrap_name_default()
+@layer_support()
def block_expand_layer(input,
channel=0,
block_x=0,
@@ -3303,7 +3356,8 @@ def block_expand_layer(input,
stride_y=0,
padding_x=0,
padding_y=0,
- name=None):
+ name=None,
+ layer_attr=None):
"""
Expand feature map to minibatch matrix.
- matrix width is: block_y * block_x * channel
@@ -3350,6 +3404,8 @@ def block_expand_layer(input,
:type padding_y: int
:param name: The name of this layer, which can not specify.
:type name: None|basestring.
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3364,13 +3420,83 @@ def block_expand_layer(input,
padding_y=padding_y)
),
type=LayerType.BLOCK_EXPAND,
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input])
@wrap_name_default()
-def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
+@layer_support()
+def maxout_layer(input,
+ groups,
+ num_channels=None,
+ size_x=None,
+ size_y=None,
+ name=None,
+ layer_attr=None):
+ """
+ A layer that applies the maxout operation to the output of a conv layer.
+ - Input: output of a conv layer.
+ - Output: feature map with the same spatial size as the input; the number
+ of channels is (input channels) / groups.
+
+ So groups should be larger than 1, and the number of input channels should
+ be divisible by groups.
+
+ Please refer to Paper:
+ - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+ - Multi-digit Number Recognition from Street View \
+ Imagery using Deep Convolutional Neural Networks: \
+ https://arxiv.org/pdf/1312.6082v4.pdf
+
+ The simple usage is:
+
+ .. code-block:: python
+
+ maxout = maxout_layer(input,
+ num_channels=128,
+ groups=4)
+
+ :param input: The input layer.
+ :type input: LayerOutput
+ :param num_channels: The number of channels of the input layer. If None, it is
+ set automatically from the previous output.
+ :type num_channels: int|None
+ :param groups: The group number of the input layer.
+ :type groups: int
+ :param size_x: conv output width. If None, it is set automatically from the
+ previous output.
+ :type size_x: int|None
+ :param size_y: conv output height. If None, it is set automatically from the
+ previous output.
+ :type size_y: int|None
+ :param name: The name of this layer. It is optional.
+ :type name: None|basestring.
+ :param layer_attr: Extra Layer attribute.
+ :type layer_attr: ExtraLayerAttribute
+ :return: LayerOutput object.
+ :rtype: LayerOutput
+ """
+ assert input.layer_type == LayerType.CONV_LAYER
+ assert isinstance(input.activation, LinearActivation)
+ assert groups > 1
+ if num_channels is None:
+ assert input.num_filters is not None
+ num_channels = input.num_filters
+ assert num_channels % groups == 0
+ Layer(name=name,
+ inputs=Input(input.name,
+ maxout=MaxOut(channels=num_channels,
+ groups=groups)),
+ type=LayerType.MAXOUT,
+ **ExtraLayerAttribute.to_kwargs(layer_attr))
+ return LayerOutput(name, LayerType.MAXOUT, parents=[input])
+
+
+@wrap_name_default()
+@layer_support()
+def ctc_layer(input, label, size=None, name=None, norm_by_times=False,
+ layer_attr=None):
"""
Connectionist Temporal Classification (CTC) is designed for temporal
classication task. That is, for sequence labeling problems where the
@@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
:type name: basestring|None
:param norm_by_times: Whether to normalization by times. False by default.
:type norm_by_times: bool
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
type=LayerType.CTC_LAYER,
size=size,
norm_by_times=norm_by_times,
- inputs=[input.name, label.name]
+ inputs=[input.name, label.name],
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
@wrap_name_default()
@wrap_param_attr_default()
-def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
+@layer_support()
+def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None,
+ layer_attr=None):
"""
A layer for calculating the cost of sequential conditional random
field model.
@@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
:type param_attr: ParameterAttribute
:param name: The name of this layers. It is not necessary.
:type name: None|basestring
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
type=LayerType.CRF_LAYER,
size=size,
inputs=ipts,
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
parents = [input, label]
if weight is not None:
@@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
@wrap_name_default()
@wrap_param_attr_default()
-def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
+@layer_support()
+def crf_decoding_layer(input, size, label=None, param_attr=None, name=None,
+ layer_attr=None):
"""
A layer for calculating the decoding sequence of sequential conditional
random field model. The decoding sequence is stored in output.ids.
@@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
:type param_attr: ParameterAttribute
:param name: The name of this layers. It is not necessary.
:type name: None|basestring
+ :param layer_attr: Extra Layer config.
+ :type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
@@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
type=LayerType.CRF_DECODING_LAYER,
size=size,
inputs=ipts,
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
)
parents = [input]
if label is not None:
parents.append(label)
return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size)
+@wrap_bias_attr_default(has_bias=True)
+@wrap_name_default()
+@layer_support()
+def nce_layer(input, label, num_classes, weight=None,
+ num_neg_samples=10, neg_distribution=None,
+ name=None, bias_attr=None, layer_attr=None):
+ """
+ Noise-contrastive estimation.
+ Implements the method in the following paper:
+ A fast and simple algorithm for training neural probabilistic language models.
+
+ The example usage is:
+
+ .. code-block:: python
+
+ cost = nce_layer(input=layer1, label=layer2, weight=layer3,
+ num_classes=3, neg_distribution=[0.1,0.3,0.6])
+
+ :param name: layer name
+ :type name: basestring
+ :param input: input layers. It could be a LayerOutput or a list/tuple of LayerOutput.
+ :type input: LayerOutput|list|tuple|collections.Sequence
+ :param label: label layer
+ :type label: LayerOutput
+ :param weight: weight layer, can be None (default).
+ :type weight: LayerOutput
+ :param num_classes: number of classes.
+ :type num_classes: int
+ :param num_neg_samples: number of negative samples. Default is 10.
+ :type num_neg_samples: int
+ :param neg_distribution: The distribution for generating the random negative labels.
+ A uniform distribution will be used if not provided.
+ If not None, its length must be equal to num_classes.
+ :type neg_distribution: list|tuple|collections.Sequence|None
+ :param bias_attr: Bias parameter attribute. False means no bias; None means the default bias.
+ :type bias_attr: ParameterAttribute|None|False
+ :param layer_attr: Extra Layer Attribute.
+ :type layer_attr: ExtraLayerAttribute
+ :return: LayerOutput object.
+ :rtype: LayerOutput
+ """
+ if isinstance(input, LayerOutput):
+ input = [input]
+ assert isinstance(input, collections.Sequence)
+ assert isinstance(label, LayerOutput)
+ assert label.layer_type == LayerType.DATA
+ if neg_distribution is not None:
+ assert isinstance(neg_distribution, collections.Sequence)
+ assert len(neg_distribution) == num_classes
+ assert sum(neg_distribution) == 1
+
+ ipts_for_layer = []
+ parents = []
+ for each_input in input:
+ assert isinstance(each_input, LayerOutput)
+ ipts_for_layer.append(each_input.name)
+ parents.append(each_input)
+ ipts_for_layer.append(label.name)
+ parents.append(label)
+
+ if weight is not None:
+ assert isinstance(weight, LayerOutput)
+ assert weight.layer_type == LayerType.DATA
+ ipts_for_layer.append(weight.name)
+ parents.append(weight)
+
+ Layer(
+ name=name,
+ type=LayerType.NCE_LAYER,
+ num_classes=num_classes,
+ neg_sampling_dist=neg_distribution,
+ num_neg_samples=num_neg_samples,
+ inputs=ipts_for_layer,
+ bias=ParamAttr.to_bias(bias_attr),
+ **ExtraLayerAttribute.to_kwargs(layer_attr)
+ )
+ return LayerOutput(name, LayerType.NCE_LAYER, parents=parents)
"""
following are cost Layers.
@@ -3534,7 +3750,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
@wrap_name_default()
-def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
+@layer_support()
+def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr=None):
"""
A cost Layer for learning to rank using gradient descent. Details can refer
to `papers 0
+
+ if HasInputsSet(): # input already set
+ Outputs(*[l.name for l in layers])
+ return # just return outputs.
+
if len(layers) != 1:
logger.warning("`outputs` routine try to calculate network's"
" inputs and outputs order. It might not work well."
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index 4660a6b5003daf..d4b947517b7d04 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -362,6 +362,13 @@ def __extends__(dict1, dict2):
default_factory=lambda _: BaseRegularization())
def settings(batch_size,
learning_rate=1e-3,
+ learning_rate_decay_a=0.,
+ learning_rate_decay_b=0.,
+ learning_rate_schedule='poly',
+ learning_rate_args='',
+ average_window=0,
+ do_average_in_cpu=False,
+ max_average_window=None,
learning_method=None,
regularization=None,
is_async=False,
@@ -408,10 +415,14 @@ def settings(batch_size,
else:
algorithm = 'owlqn'
+ args=['batch_size', 'learning_rate', 'learning_rate_decay_a',
+ 'learning_rate_decay_b', 'learning_rate_schedule',
+ 'learning_rate_args', 'average_window', 'do_average_in_cpu',
+ 'max_average_window']
kwargs = dict()
- kwargs['batch_size'] = batch_size
- kwargs['learning_rate'] = learning_rate
kwargs['algorithm'] = algorithm
+ for arg in args:
+ kwargs[arg] = locals()[arg]
kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
learning_method.extra_settings()
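Since settings() now forwards the learning-rate schedule and parameter-averaging options, a configuration can set them directly. A hedged sketch; the numeric values below are placeholders, not recommended defaults:

    settings(
        batch_size=128,
        learning_rate=1e-3,
        learning_rate_schedule='poly',
        learning_rate_decay_a=0.1,   # schedule-specific coefficients;
        learning_rate_decay_b=0.5,   # values here are illustrative
        average_window=0.5,          # enable parameter averaging
        max_average_window=10000,
        learning_method=MomentumOptimizer(momentum=0.9),
    )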
diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5
index 359652f3d09c7f..88ce5c129e552e 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/check.md5
+++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5
@@ -2,13 +2,17 @@
a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
9c038249ec8ff719753a746cdb04c026 layer_activations.protostr
5913f87b39cee3b2701fa158270aca26 projections.protostr
+7334ba0a4544f0623231330fc51d390d shared_fc.protostr
+8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr
6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr
0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr
+6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr
144bc6d3a509de74115fa623741797ed test_expand_layer.protostr
2378518bdb71e8c6e888b1842923df58 test_fc.protostr
8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr
1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr
d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr
+6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr
251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr
e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr
2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index e8be0023e70134..15c66a9754604c 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer
test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group test_bilinear_interp)
-
+test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
+test_bilinear_interp test_maxout)
for conf in ${configs[*]}
do
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
new file mode 100644
index 00000000000000..202cf367fc7f28
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
@@ -0,0 +1,22 @@
+from paddle.trainer_config_helpers import *
+
+settings(
+ learning_rate=1e-4,
+ batch_size=1000
+)
+
+a = data_layer(name='feature_a', size=200)
+b = data_layer(name='feature_b', size=200)
+
+fc_param = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0)
+bias_param = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0)
+
+softmax_param = ParamAttr(name='softmax_param', initial_max=1.0, initial_min=-1.0)
+
+hidden_a = fc_layer(input=a, size=200, param_attr=fc_param, bias_attr=bias_param)
+hidden_b = fc_layer(input=b, size=200, param_attr=fc_param, bias_attr=bias_param)
+
+predict = fc_layer(input=[hidden_a, hidden_b], param_attr=[softmax_param, softmax_param],
+ bias_attr=False, size=10, act=SoftmaxActivation())
+
+outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
new file mode 100644
index 00000000000000..8557e9daaf66ad
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
@@ -0,0 +1,29 @@
+from paddle.trainer_config_helpers import *
+
+settings(learning_rate=1e-4, batch_size=1000)
+
+data_1 = data_layer(name='data_a', size=100)
+data_2 = data_layer(name='data_b', size=100)
+
+mixed_param = ParamAttr(name='mixed_param')
+
+with mixed_layer(size=400, bias_attr=False) as m1:
+ m1 += full_matrix_projection(input=data_1, param_attr=mixed_param)
+
+with mixed_layer(size=400, bias_attr=False) as m2:
+ m2 += full_matrix_projection(input=data_2, param_attr=mixed_param)
+
+lstm_param = ParamAttr(name='lstm_param')
+lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.)
+
+lstm1 = lstmemory_group(input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False)
+lstm2 = lstmemory_group(input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False)
+
+softmax_param = ParamAttr(name='softmax_param')
+
+predict = fc_layer(input=[last_seq(input=lstm1), last_seq(input=lstm2)],
+ size=10,
+ param_attr=[softmax_param, softmax_param],
+ bias_attr=False,
+ act=SoftmaxActivation())
+outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
new file mode 100644
index 00000000000000..29749cbb666379
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -0,0 +1,14 @@
+from paddle.trainer_config_helpers import *
+
+settings(
+ learning_rate=1e-4,
+ batch_size=1000
+)
+
+data = data_layer(name='input', size=300)
+lbl = data_layer(name='label', size=1)
+wt = data_layer(name='weight', size=1)
+fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
+
+outputs(classification_cost(input=fc, label=lbl, weight=wt),
+ regression_cost(input=fc, label=lbl, weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
new file mode 100644
index 00000000000000..079e2cf4c43206
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
@@ -0,0 +1,30 @@
+from paddle.trainer_config_helpers import *
+
+settings(
+ batch_size=1000,
+ learning_rate=1e-5
+)
+
+data = data_layer(name='data', size=2304)
+
+conv = img_conv_layer(input=data,
+ filter_size = 3,
+ num_channels=1,
+ num_filters=16,
+ padding=1,
+ act=LinearActivation(),
+ bias_attr=True)
+
+maxout = maxout_layer(input=conv,
+ num_channels=16,
+ groups=2)
+
+pool = img_pool_layer(input=maxout,
+ num_channels=8,
+ pool_size=2,
+ stride=2,
+ pool_type=MaxPooling())
+
+fc = fc_layer(input=pool, size=384, bias_attr=False)
+
+outputs(fc)