diff --git a/.gitignore b/.gitignore index 7e21ba0b750dfc..65ba217de37c82 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ build/ *.user .vscode -.idea \ No newline at end of file +.idea +.project +.pydevproject diff --git a/.travis.yml b/.travis.yml index d3dae9efd416bd..bf0e0b7bbddd4c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,17 @@ language: cpp cache: ccache sudo: required dist: trusty +os: + - linux + - osx env: - JOB=DOCS - JOB=BUILD_AND_TEST +matrix: + exclude: + - os: osx + env: JOB=DOCS # Only generate documentation in linux + addons: apt: packages: @@ -27,9 +35,11 @@ addons: - libgoogle-glog-dev - libgflags-dev - libgtest-dev + - graphviz before_install: + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - pip install wheel protobuf sphinx breathe recommonmark - - sudo paddle/scripts/travis/before_install.sh script: - paddle/scripts/travis/main.sh notifications: diff --git a/CMakeLists.txt b/CMakeLists.txt index 44e93f22c0eaf4..4613155f7700b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b1) +set(PADDLE_PATCH_VERSION 0b2) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") @@ -104,7 +104,7 @@ else() endif(NOT WITH_GPU) if(WITH_DOUBLE) - add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE) + add_definitions(-DPADDLE_TYPE_DOUBLE) set(ACCURACY double) else(WITH_DOUBLE) set(ACCURACY float) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 529b4b9d15d097..57c32a54cd727e 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -17,10 +17,17 @@ ## Find MKL First. set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib) -find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib) -find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib) +find_path(MKL_INCLUDE_DIR mkl.h PATHS + ${MKL_ROOT}/include) +find_library(MKL_CORE_LIB NAMES mkl_core PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) +find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) +find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index cc59309ee7efab..dbad6be3f41b3f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -64,7 +64,9 @@ set(COMMON_FLAGS -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-error=literal-suffix - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=unused-function # Warnings in Numpy Header. 
+) foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/util.cmake b/cmake/util.cmake index d776c3ae499526..0fa36f070cc11b 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME) cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) endmacro() + + +# Creates C resources file from files in given resource file +function(create_resources res_file output) + # Create empty output file + file(WRITE ${output} "") + # Get short filename + string(REGEX MATCH "([^/]+)$" filename ${res_file}) + # Replace filename spaces & extension separator for C compatibility + string(REGEX REPLACE "\\.| |-" "_" filename ${filename}) + # Read hex data from file + file(READ ${res_file} filedata HEX) + # Convert hex data for C compatibility + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata}) + # Append data to output file + file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n") +endfunction() diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore new file mode 100644 index 00000000000000..810910fd5ca56f --- /dev/null +++ b/demo/mnist/.gitignore @@ -0,0 +1,6 @@ +data/raw_data +data/*.list +mnist_vgg_model +plot.png +train.log +*pyc diff --git a/demo/mnist/data/generate_list.py b/demo/mnist/data/generate_list.py new file mode 100644 index 00000000000000..1b929048b4d82b --- /dev/null +++ b/demo/mnist/data/generate_list.py @@ -0,0 +1,21 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +o = open("./" + "train.list", "w") +o.write("./data/raw_data/train" +"\n") +o.close() + +o = open("./" + "test.list", "w") +o.write("./data/raw_data/t10k" +"\n") +o.close() \ No newline at end of file diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh new file mode 100755 index 00000000000000..9099b5ab6fb85d --- /dev/null +++ b/demo/mnist/data/get_mnist_data.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env sh +# This scripts downloads the mnist data and unzips it. +set -e +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +rm -rf "$DIR/raw_data" +mkdir "$DIR/raw_data" +cd "$DIR/raw_data" + +echo "Downloading..." + +for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte +do + if [ ! -e $fname ]; then + wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz + gunzip ${fname}.gz + fi +done + +cd $DIR +rm -f *.list +python generate_list.py + diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py new file mode 100644 index 00000000000000..32af29730a7365 --- /dev/null +++ b/demo/mnist/mnist_provider.py @@ -0,0 +1,32 @@ +from paddle.trainer.PyDataProvider2 import * + + +# Define a py data provider +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) +def process(settings, filename): # settings is not used currently. 
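+    # MNIST raw files use the IDX format: the image file starts with a 16-byte
+    # header (magic, count, rows, cols) and the label file with an 8-byte header
+    # (magic, count); pixel values then follow as unsigned bytes, row-major.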
+ imgf = filename + "-images-idx3-ubyte" + labelf = filename + "-labels-idx1-ubyte" + f = open(imgf, "rb") + l = open(labelf, "rb") + + f.read(16) + l.read(8) + + # Define number of samples for train/test + if "train" in filename: + n = 60000 + else: + n = 10000 + + for i in range(n): + label = ord(l.read(1)) + pixels = [] + for j in range(28 * 28): + pixels.append(float(ord(f.read(1))) / 255.0) + yield {"pixel": pixels, 'label': label} + + f.close() + l.close() diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh new file mode 100755 index 00000000000000..084b32ac390b84 --- /dev/null +++ b/demo/mnist/train.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +config=vgg_16_mnist.py +output=./mnist_vgg_model +log=train.log + +paddle train \ +--config=$config \ +--dot_period=10 \ +--log_period=100 \ +--test_all_data_in_one_period=1 \ +--use_gpu=0 \ +--trainer_count=1 \ +--num_passes=100 \ +--save_dir=$output \ +2>&1 | tee $log + +python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py new file mode 100644 index 00000000000000..45a45bb061aa78 --- /dev/null +++ b/demo/mnist/vgg_16_mnist.py @@ -0,0 +1,53 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
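+
+# A small VGG-style convolutional network for MNIST: 1-channel 28x28 input
+# images, 10 output classes (predicted with small_vgg from trainer_config_helpers).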
+ +from paddle.trainer_config_helpers import * + +is_predict = get_config_arg("is_predict", bool, False) + +####################Data Configuration ################## + + +if not is_predict: + data_dir='./data/' + define_py_data_sources2(train_list= data_dir + 'train.list', + test_list= data_dir + 'test.list', + module='mnist_provider', + obj='process') + +######################Algorithm Configuration ############# +settings( + batch_size = 128, + learning_rate = 0.1 / 128.0, + learning_method = MomentumOptimizer(0.9), + regularization = L2Regularization(0.0005 * 128) +) + +#######################Network Configuration ############# + +data_size=1*28*28 +label_size=10 +img = data_layer(name='pixel', size=data_size) + +# small_vgg is predined in trainer_config_helpers.network +predict = small_vgg(input_image=img, + num_channels=1, + num_classes=label_size) + +if not is_predict: + lbl = data_layer(name="label", size=label_size) + inputs(img, lbl) + outputs(classification_cost(input=predict, label=lbl)) +else: + outputs(predict) diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh index fb2bee98beb268..fe2acbbd74898f 100755 --- a/demo/quick_start/preprocess.sh +++ b/demo/quick_start/preprocess.sh @@ -20,6 +20,8 @@ set -e +export LC_ALL=C + mkdir -p data/tmp python preprocess.py -i data/reviews_Electronics_5.json.gz # uniq and shuffle diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh index 1f0a137c8bd594..ea4e32249a3d01 100755 --- a/demo/quick_start/train.sh +++ b/demo/quick_start/train.sh @@ -18,6 +18,8 @@ cfg=trainer_config.lr.py #cfg=trainer_config.emb.py #cfg=trainer_config.cnn.py #cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py paddle train \ --config=$cfg \ --save_dir=./output \ diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/demo/quick_start/trainer_config.bidi-lstm.py new file mode 100644 index 00000000000000..3be3d373422714 --- /dev/null +++ b/demo/quick_start/trainer_config.bidi-lstm.py @@ -0,0 +1,62 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) 
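+# Output-layer bias: initialised to zero (initial_std=0.) and excluded from
+# L2 weight decay (l2_rate=0.).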
+data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) + +bi_lstm = bidirectional_lstm(input=emb, size=128) +dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) + +output = fc_layer(input=dropout, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/quick_start/trainer_config.db-lstm.py b/demo/quick_start/trainer_config.db-lstm.py new file mode 100644 index 00000000000000..b35bdf5a61b473 --- /dev/null +++ b/demo/quick_start/trainer_config.db-lstm.py @@ -0,0 +1,73 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) 
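+# "db-lstm": a deep stack of LSTM layers whose directions alternate from layer
+# to layer (see the loop over lstmemory below).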
+ +data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) + +hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)]) +lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1)) + +input_layers = [hidden_0, lstm_0] + +for i in range(1,8): + fc = fc_layer(input=input_layers, size=128) + lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1), + reverse=(i % 2) == 1,) + input_layers = [fc, lstm] + +lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) + +output = fc_layer(input=lstm_last, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index 2b0c3f34648b05..edd6ad3f739b6c 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf, encoded_vector = concat_layer(input=[src_forward, src_backward]) with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(encoded_vector) + encoded_proj += full_matrix_projection(input=encoded_vector) backward_first = first_seq(input=src_backward) with mixed_layer(size=decoder_size, act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(backward_first) + decoder_boot += full_matrix_projection(input=backward_first) def gru_decoder_with_attention(enc_vec, enc_proj, current_word): decoder_mem = memory(name='gru_decoder', @@ -113,8 +113,8 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): decoder_state=decoder_mem, ) with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(context) - decoder_inputs += full_matrix_projection(current_word) + decoder_inputs += full_matrix_projection(input=context) + decoder_inputs += full_matrix_projection(input=current_word) gru_step = gru_step_layer(name='gru_decoder', input=decoder_inputs, diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh new file mode 100755 index 00000000000000..e579d6c46ce5ed --- /dev/null +++ b/demo/sequence_tagging/data/get_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
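+
+# Download the CoNLL-2000 text chunking train/test sets next to this script.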
+set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz +wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz diff --git a/demo/sequence_tagging/data/test.list b/demo/sequence_tagging/data/test.list new file mode 100644 index 00000000000000..073c0a0c9063ac --- /dev/null +++ b/demo/sequence_tagging/data/test.list @@ -0,0 +1 @@ +data/test.txt.gz diff --git a/demo/sequence_tagging/data/train.list b/demo/sequence_tagging/data/train.list new file mode 100644 index 00000000000000..43c24d5f6484a9 --- /dev/null +++ b/demo/sequence_tagging/data/train.list @@ -0,0 +1 @@ +data/train.txt.gz diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py new file mode 100644 index 00000000000000..6f412d6834be6d --- /dev/null +++ b/demo/sequence_tagging/dataprovider.py @@ -0,0 +1,258 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import gzip +import logging + +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', +) +logger = logging.getLogger('paddle') +logger.setLevel(logging.INFO) + +OOV_POLICY_IGNORE = 0 +OOV_POLICY_USE = 1 +OOV_POLICY_ERROR = 2 + +num_original_columns = 3 + +# Feature combination patterns. +# [[-1,0], [0,0]] means previous token at column 0 and current token at +# column 0 are combined as one feature. +patterns = [ + [[-2,0]], + [[-1,0]], + [[0,0]], + [[1,0]], + [[2,0]], + + [[-1,0], [0,0]], + [[0,0], [1,0]], + + [[-2,1]], + [[-1,1]], + [[0,1]], + [[1,1]], + [[2,1]], + [[-2,1], [-1,1]], + [[-1,1], [0,1]], + [[0,1], [1,1]], + [[1,1], [2,1]], + + [[-2,1], [-1,1], [0,1]], + [[-1,1], [0,1], [1,1]], + [[0,1], [1,1], [2,1]], +] + +dict_label = { + 'B-ADJP': 0, + 'I-ADJP': 1, + 'B-ADVP': 2, + 'I-ADVP': 3, + 'B-CONJP': 4, + 'I-CONJP': 5, + 'B-INTJ': 6, + 'I-INTJ': 7, + 'B-LST': 8, + 'I-LST': 9, + 'B-NP': 10, + 'I-NP': 11, + 'B-PP': 12, + 'I-PP': 13, + 'B-PRT': 14, + 'I-PRT': 15, + 'B-SBAR': 16, + 'I-SBAR': 17, + 'B-UCP': 18, + 'I-UCP': 19, + 'B-VP': 20, + 'I-VP': 21, + 'O': 22 +} + +def make_features(sequence): + length = len(sequence) + num_features = len(sequence[0]) + def get_features(pos): + if pos < 0: + return ['#B%s' % -pos] * num_features + if pos >= length: + return ['#E%s' % (pos - length + 1)] * num_features + return sequence[pos] + + for i in xrange(length): + for pattern in patterns: + fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern]) + sequence[i].append(fname) + +''' +Source file format: +Each line is for one timestep. The features are separated by space. +An empty line indicates end of a sequence. + +cutoff: a list of numbers. If count of a feature is smaller than this, + it will be ignored. +if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of +i-th column. 
+ +return a list of dict for each column +''' +def create_dictionaries(filename, cutoff, oov_policy): + def add_to_dict(sequence, dicts): + num_features = len(dicts) + for features in sequence: + l = len(features) + assert l == num_features, "Wrong number of features " + line + for i in xrange(l): + if features[i] in dicts[i]: + dicts[i][features[i]] += 1 + else: + dicts[i][features[i]] = 1 + + num_features = len(cutoff) + dicts = [] + for i in xrange(num_features): + dicts.append(dict()) + + f = gzip.open(filename, 'rb') + + sequence = [] + + for line in f: + line = line.strip() + if not line: + make_features(sequence) + add_to_dict(sequence, dicts) + sequence = [] + continue + features = line.split(' ') + sequence.append(features) + + + for i in xrange(num_features): + dct = dicts[i] + n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 + todo = [] + for k, v in dct.iteritems(): + if v < cutoff[i]: + todo.append(k) + else: + dct[k] = n + n += 1 + + if oov_policy[i] == OOV_POLICY_USE: + # placeholder so that len(dct) will be the number of features + # including OOV + dct['#OOV#'] = 0 + + logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo))) + for k in todo: + del dct[k] + + f.close() + return dicts + + +def initializer(settings, **xargs): + cutoff = [3, 1, 0] + cutoff += [3] * len(patterns) + oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] + oov_policy += [OOV_POLICY_IGNORE] * len(patterns) + dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy) + dicts[2] = dict_label + settings.dicts = dicts + settings.oov_policy = oov_policy + input_types = [] + num_features = len(dicts) + for i in xrange(num_original_columns): + input_types.append(integer_sequence(len(dicts[i]))) + logger.info("slot %s size=%s" % (i, len(dicts[i]))) + if patterns: + dim = 0 + for i in xrange(num_original_columns, num_features): + dim += len(dicts[i]) + input_types.append(sparse_binary_vector_sequence(dim)) + logger.info("feature size=%s" % dim) + settings.input_types = input_types + +''' +if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not +existed in dicts[i] will be assigned to id 0. +if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist +in dicts[i]. 
+''' +@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, filename): + input_file = filename + dicts = settings.dicts + oov_policy = settings.oov_policy + + def gen_sample(sequence): + num_features = len(dicts) + sample = [list() for i in xrange(num_original_columns)] + if patterns: + sample.append([]) + for features in sequence: + assert len(features) == num_features, \ + "Wrong number of features: " + line + for i in xrange(num_original_columns): + id = dicts[i].get(features[i], -1) + if id != -1: + sample[i].append(id) + elif oov_policy[i] == OOV_POLICY_IGNORE: + sample[i].append(0xffffffff) + elif oov_policy[i] == OOV_POLICY_ERROR: + logger.fatal("Unknown token: %s" % features[i]) + else: + sample[i].append(0) + + if patterns: + dim = 0 + vec = [] + for i in xrange(num_original_columns, num_features): + id = dicts[i].get(features[i], -1) + if id != -1: + vec.append(dim + id) + elif oov_policy[i] == OOV_POLICY_IGNORE: + pass + elif oov_policy[i] == OOV_POLICY_ERROR: + logger.fatal("Unknown token: %s" % features[i]) + else: + vec.ids.append(dim + 0) + + dim += len(dicts[i]) + sample[-1].append(vec) + return sample + + num_features = len(dicts) + f = gzip.open(input_file, 'rb') + + num_sequences = 0 + sequence = [] + for line in f: + line = line.strip() + if not line: + make_features(sequence) + yield gen_sample(sequence) + sequence = [] + num_sequences += 1 + continue + features = line.split(' ') + sequence.append(features) + + f.close() + + logger.info("num_sequences=%s" % num_sequences) + diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py new file mode 100644 index 00000000000000..2bd1a20bc52fc5 --- /dev/null +++ b/demo/sequence_tagging/linear_crf.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + + +batch_size = 1 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-4), + average_window=0.5, + learning_rate=1e-1, + learning_rate_decay_a=1e-5, + learning_rate_decay_b=0.25, +) + +num_label_types=23 + +def get_simd_size(size): + return int(math.ceil(float(size) / 8)) * 8 + +# Currently, in order to use sparse_update=True, +# the size has to be aligned. 
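+# For example, the 23 chunk label types are padded up to 24, the next multiple of 8.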
+num_label_types = get_simd_size(num_label_types) + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types) + +crf_input = fc_layer( + input=features, + size=num_label_types, + act=LinearActivation(), + bias_attr=False, + param_attr=ParamAttr(initial_std=0, sparse_update=True)) + +crf=crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), +) + +crf_decoding=crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/readme.md b/demo/sequence_tagging/readme.md new file mode 100644 index 00000000000000..2e17fffb83c532 --- /dev/null +++ b/demo/sequence_tagging/readme.md @@ -0,0 +1,45 @@ +# Sequence Tagging + +This demo is a sequence model for assigning tags to each token in a sentence. The task is the CoNLL-2000 text chunking task. + +## Download data +```bash +cd demo/sequence_tagging +./data/get_data.sh +``` + +## Train model +```bash +cd demo/sequence_tagging +./train.sh +``` + +## Model description + +We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py). +
+
+| Model name | Number of parameters | F1 score |
+| ---------- | -------------------- | -------- |
+| linear_crf | 1.8M                 | 0.937    |
+| rnn_crf    | 960K                 | 0.941    |
+
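+
+Both configs compute this score with `chunk_evaluator` using the IOB tagging scheme. As a rough, illustrative sketch of what chunk-level F1 measures (this is not PaddlePaddle's internal evaluator, just the standard CoNLL-style definition with hypothetical helper names):
+
+```python
+def extract_chunks(tags):
+    """Return a set of (start, end, type) chunks from IOB tags such as 'B-NP'."""
+    chunks, start, ctype = set(), None, None
+    for i, tag in enumerate(tags + ['O']):  # trailing 'O' flushes the last open chunk
+        boundary = (tag.startswith('B-') or tag == 'O'
+                    or (tag.startswith('I-') and tag[2:] != ctype))
+        if boundary:
+            if ctype is not None:
+                chunks.add((start, i, ctype))
+            start, ctype = (i, tag[2:]) if tag != 'O' else (None, None)
+    return chunks
+
+def chunk_f1(gold_tags, pred_tags):
+    gold, pred = extract_chunks(gold_tags), extract_chunks(pred_tags)
+    correct = len(gold & pred)
+    precision = correct / float(len(pred) or 1)
+    recall = correct / float(len(gold) or 1)
+    return 2 * precision * recall / ((precision + recall) or 1.0)
+
+# One of the two predicted chunks matches the gold chunks -> F1 = 0.5
+print(chunk_f1(['B-NP', 'I-NP', 'O', 'B-VP'], ['B-NP', 'I-NP', 'O', 'B-NP']))
+```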
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py new file mode 100644 index 00000000000000..fb157bf3ea7193 --- /dev/null +++ b/demo/sequence_tagging/rnn_crf.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + +batch_size = 16 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-5), + average_window=0.5, + learning_rate = 2e-3, + learning_rate_decay_a = 5e-7, + learning_rate_decay_b = 0.5, +) + +word_dim=128 +hidden_dim = 128 +with_rnn = True + +initial_std=1/math.sqrt(hidden_dim) +param_attr=ParamAttr(initial_std=initial_std) +cpu_layer_attr=ExtraLayerAttribute(device=-1) + +default_device(0) + +num_label_types=23 + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types, + layer_attr=cpu_layer_attr) + +emb = embedding_layer( + input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) + +hidden1 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(emb), + table_projection(pos, param_attr=param_attr)] +) + +if with_rnn: + rnn1 = recurrent_layer( + act=ReluActivation(), + bias_attr=True, + input=hidden1, + param_attr=ParamAttr(initial_std=0), + ) + +hidden2 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(hidden1) + ] + ([ + full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +if with_rnn: + rnn2=recurrent_layer( + reverse=True, + act=ReluActivation(), + bias_attr=True, + input=hidden2, + param_attr=ParamAttr(initial_std=0), + ) + +crf_input = mixed_layer( + size=num_label_types, + bias_attr=False, + input=[ + full_matrix_projection(hidden2), + ] + ([ + full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +crf = crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), + layer_attr=cpu_layer_attr, +) + +crf_decoding = crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), + layer_attr=cpu_layer_attr, +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh new file mode 100755 index 00000000000000..9a706b98d86861 --- /dev/null +++ b/demo/sequence_tagging/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +paddle train \ + --config rnn_crf.py \ + --parallel_nn=1 
\ + --use_gpu=1 \ + --dot_period=10 \ + --log_period=1000 \ + --test_period=0 \ + --num_passes=10 diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh new file mode 100755 index 00000000000000..597b5afea9c63a --- /dev/null +++ b/demo/sequence_tagging/train_linear.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +paddle train \ + --config linear_crf.py \ + --use_gpu=0 \ + --dot_period=100 \ + --log_period=10000 \ + --test_period=0 \ + --num_passes=10 diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index 06fcff61720755..bbdbb4d4227d0b 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -99,3 +99,7 @@ git pull --rebase upstream HEAD git push -f origin HEAD ``` Now your Pull Request is updated with the latest version. + +## Revise your pull request + +When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request. diff --git a/doc/build/docker_install.rst b/doc/build/docker_install.rst index 542b9bac27afb8..e95de35f4da35f 100644 --- a/doc/build/docker_install.rst +++ b/doc/build/docker_install.rst @@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme .. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}" + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index ee3fa2a2166f49..e7d74512292c89 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -134,7 +134,7 @@ def process(settings, file_name): You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies: - The path of the training and testing data (`data/train.list`, `data/test.list`). -- The location of the data provider file (`dataprovider_pow`). +- The location of the data provider file (`dataprovider_bow`). - The function to call to get data. (`process`). - Additional arguments or data. Here it passes the path of word dictionary. diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 01443466105b5b..ab27c3bd6e8ad7 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -73,6 +73,12 @@ img_pool_layer :members: img_pool_layer :noindex: +maxout_layer +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: maxout_layer + :noindex: + Norm Layer ========== @@ -130,6 +136,12 @@ gru_step_layer Recurrent Layer Group ===================== +memory +------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: memory + :noindex: + recurrent_group --------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -377,6 +389,12 @@ ctc_layer :members: ctc_layer :noindex: +nce_layer +----------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: nce_layer + :noindex: + hsigmoid --------- .. 
automodule:: paddle.trainer_config_helpers.layers diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.md new file mode 100644 index 00000000000000..5282bbbcb82d00 --- /dev/null +++ b/doc_cn/algorithm/rnn/hierarchical-layer.md @@ -0,0 +1,66 @@ +# 支持双层序列作为输入的Layer + +## 概述 + +在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 + +双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 + +我们可以按照如下层次定义非序列,单层序列,以及双层序列。 + ++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 ++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 ++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 + + +在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 +## pooling_layer + +pooling_layer的使用示例如下,详细见配置API。 +```python +seq_pool = pooling_layer(input=layer, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 + - 输入:一个双层序列,或一个单层序列 + - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) + +## last_seq 和 first_seq + +last_seq的使用示例如下(first_seq类似),详细见配置API。 +```python +last = last_seq(input=layer, + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 + - 输入:一个双层序列或一个单层序列 + - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 + +## expand_layer + +expand_layer的使用示例如下,详细见配置API。 +```python +expand = expand_layer(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_TIMESTEP) +``` +- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): + - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 + - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 + - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 +- `expand_level=ExpandLevel.FROM_SEQUENCE`时: + - 作用:一个单层序列经过运算扩展成一个双层序列 + - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 + - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 \ No newline at end of file diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md new file mode 100644 index 00000000000000..4a85cf336146ef --- /dev/null +++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md @@ -0,0 +1,403 @@ +# 双层RNN配置与示例 + +我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 + +## 示例1:双进双出,subseq间无memory + +配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 + +### 读取双层序列的方法 + +首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): + +- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . 
+2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * + +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . + +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! + +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): + +- 单层序列的dataprovider如下: + - word_slot是integer_value_sequence类型,代表单层序列。 + - label是integer_value类型,代表一个向量。 + +```python +def hook(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sequence(len(settings.word_dict)), + integer_value(3)] + +@provider(init_hook=hook) +def process(settings, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + label, comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + yield word_slot, label +``` + +- 双层序列的dataprovider如下: + - word_slot是integer_value_sub_sequence类型,代表双层序列。 + - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 + - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 + +```python +def hook2(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sequence(3)] + +@provider(init_hook=hook2) +def process2(settings, file_name): + with open(file_name) as fdata: + label_list = [] + word_slot_list = [] + for line in fdata: + if (len(line)) > 1: + label,comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + label_list.append(label) + word_slot_list.append(word_slot) + else: + yield word_slot_list, label_list + label_list = [] + word_slot_list = [] +``` + +### 模型中的配置 + +首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 + +```python +settings(batch_size=5) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim*4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group(input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + +lstm_last = last_seq(input=lstm) + +with mixed_layer(size=label_dim, + 
act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) + +``` +其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: + +- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 +- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 +- lstmemory: + - 单层序列过了一个mixed_layer和lstmemory_group。 + - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 +- last_seq: + - 单层序列直接取了最后一个元素 + - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 + - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 + +```python +settings(batch_size=2) + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim*4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group(input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + return lstm_output + +lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), + step=lstm_group, + name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer(input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +``` +## 示例2:双进双出,subseq间有memory + +配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 + +### 读取双层序列的方法 + +我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)]) +def process_subseq(settings, file_name): + for d in data: + yield d + +@provider(input_types=[integer_value_sequence(10), + integer_value(3)]) +def process_seq(settings, file_name): + for d in data: + seq = [] +``` +- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 +- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 +- 单双层序列的label都分别是0和1 + +### 模型中的配置 + +我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 + +- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 + +```python +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + return fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + +out = recurrent_group(step=step, input=emb) +``` +- 双层序列,外层memory是一个元素: + - 
内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 + - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 + +```python +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + return fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + + inner_rnn_output = recurrent_group( + step=inner_step, + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + return inner_rnn_output + +out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) +``` +- 双层序列,外层memory是单层序列: + - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 + - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 + +## 示例3:双进双出,输入不等长 + +**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) + +### 读取双层序列的方法 + +我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data2 = [ + [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], + [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value_sub_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider + for d in data2: + yield d + + +@provider(input_types=[integer_value_sequence(10), + integer_value_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider + for d in data2: + words1=reduce(lambda x,y: x+y, d[0]) + words2=reduce(lambda x,y: x+y, d[1]) + yield words1, words2, d[2] +``` + +data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 + +- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] +- 双层序列:两个样本分别为 + - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] + - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
+ - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 +- 单双层序列中,两个样本的label都分别是0和1 + +### 模型中的配置 + +单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 + +- 单层序列: + - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 + +```python +def step(x1, x2): + def calrnn(y): + mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) + out = fc_layer(input = [y, mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'rnn_state_' + y.name) + return out + + encoder1 = calrnn(x1) + encoder2 = calrnn(x2) + return [encoder1, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="stepout", + step=step, + input=[emb1, emb2]) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` +- 双层序列: + - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 + - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 + - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + +```python +def outer_step(x1, x2): + outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) + outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) + def inner_step1(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem1) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + def inner_step2(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem2) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + encoder1 = recurrent_group( + step = inner_step1, + name = 'inner1', + input = x1) + + encoder2 = recurrent_group( + step = inner_step2, + name = 'inner2', + input = x2) + + sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') + sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') + + encoder1_expand = expand_layer(input = sentence_last_state1, + expand_as = encoder2) + + return [encoder1_expand, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` + +## 示例4:beam_search的生成 + +TBD \ No newline at end of file diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md 
b/doc_cn/algorithm/rnn/rnn-tutorial.md new file mode 100644 index 00000000000000..7a553054c80392 --- /dev/null +++ b/doc_cn/algorithm/rnn/rnn-tutorial.md @@ -0,0 +1,96 @@ +# Recurrent Group教程 + +## 概述 + +序列数据是自然语言处理任务面对的一种主要输入数据类型。 + +一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。 + +双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。 + +在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。 + +更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。 + +目前,在PaddlePaddle中,能够对双向序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:支持双层序列作为输入的Layer。 + +## 相关概念 + +### 基本原理 +`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。 + +PaddlePaddle中,`recurrent_group`的一个简单调用如下: + +``` python +recurrent_group(step, input, reverse) +``` +- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算 +- input:输入,必须是一个单层序列,或者一个双层序列 +- reverse:是否以逆序处理输入序列 + +使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。 + +### 输入 +`recurrent_group`处理的输入序列主要分为以下三种类型: + +- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。 + +- **只读Memory输入**:`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。 + +- **序列生成任务的输入**:`GeneratedInput`只用于在序列生成任务中指定输入数据。 + +### 输入示例 + +序列生成任务大多遵循encoder-decoer架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。 + +给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入: + +- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。 + +- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。 + +在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。 + +### 输出 +`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。 + +### memory +memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出,因此,可以将memory理解为一个时延操作。 + +可以显示地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。 + +## 双层RNN介绍 +`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。 + +利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。 + +- 单层(word-level)RNN:每个状态(state)对应一个词(word)。 +- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。 + +为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。 + +## 双层RNN的使用 + +### 训练流程的使用方法 +使用 `recurrent_group`需要遵循以下约定: + +- **单进单出**:输入和输出都是单层序列。 + - 如果有多个输入,不同输入序列含有的词语数必须严格相等。 + - 输出一个单层序列,输出序列的词语数和输入序列一致。 + - memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。 + - boot_layer:memory的初始状态,默认初始状为0,memory的is_seq参数必须为false。 + +- **双进双出**:输入和输出都是双层序列。 + - 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。 + - 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。 + - memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。 + - boot_layer:memory 
初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。 + +- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。 + + +### 生成流程的使用方法 +使用`beam_search`需要遵循以下约定: + +- 单层RNN:从一个word生成下一个word。 +- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 \ No newline at end of file diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc_cn/build_and_install/install/docker_install.rst index 44aa2a0983f4fd..a5f5fb117e11e8 100644 --- a/doc_cn/build_and_install/install/docker_install.rst +++ b/doc_cn/build_and_install/install/docker_install.rst @@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本 +-----------------+------------------+------------------------+-----------------------+ | GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest | +-----------------+------------------+------------------------+-----------------------+ -| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest | +| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest | +-----------------+------------------+------------------------+-----------------------+ -| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest | +| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest | +-----------------+------------------+------------------------+-----------------------+ 其中,横向包括三个版本,normal,devel和demo。 diff --git a/doc_cn/conf.py.in b/doc_cn/conf.py.in index 391f7981eab809..93242ace406000 100644 --- a/doc_cn/conf.py.in +++ b/doc_cn/conf.py.in @@ -47,6 +47,7 @@ extensions = [ 'sphinx.ext.autosummary', 'sphinx.ext.mathjax', 'sphinx.ext.napoleon', + 'sphinx.ext.graphviz' ] table_styling_embed_css = True diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst new file mode 100644 index 00000000000000..283607957ce630 --- /dev/null +++ b/doc_cn/faq/index.rst @@ -0,0 +1,169 @@ +#################### +PaddlePaddle常见问题 +#################### + +.. contents:: + +1. 如何减少PaddlePaddle的内存占用 +--------------------------------- + +神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。 +PaddlePaddle的内存占用主要分为如下几个方面\: + +* DataProvider缓冲池内存 (只针对内存) +* 神经元激活内存 (针对内存和显存) +* 参数内存 (针对内存和显存) +* 其他内存杂项 + +这其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等, +这些内存就不考虑如何缩减了。 + +其他的内存的减少方法依次为 + + +减少DataProvider缓冲池内存 +++++++++++++++++++++++++++ + +PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 + +.. graphviz:: + + digraph { + rankdir=LR; + 数据文件 -> 内存池 -> PaddlePaddle训练 + } + +所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 +个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, +那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 + +.. literalinclude:: reduce_min_pool_size.py + +这样做可以极大的减少内存占用,并且可能会加速训练过程。 详细文档参考 `这里 +<../ui/data_provider/pydataprovider2.html#provider>`_ 。 + +神经元激活内存 +++++++++++++++ + +神经网络在训练的时候,会对每一个激活暂存一些数据,包括激活,參差等等。 +在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, +一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 +的时间步信息成正比。 + +所以,做法可以有两种。他们是 + +* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 +* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, + 但是突然有一个10000长的序列,就很容易导致内存超限。特别是在LSTM等RNN中。 + +参数内存 +++++++++ + +PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 +例如如果使用 :code:`adadelta` 算法,则需要使用参数规模大约5倍的内存。 如果参数保存下来的 +文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 + +可以考虑使用一些优化算法,例如 :code:`momentum`。 + +2. 
+2. 如何加速PaddlePaddle的训练速度 +--------------------------------- + +PaddlePaddle是神经网络训练平台,加速PaddlePaddle训练有如下几个方面\: + +* 减少数据载入的耗时 +* 加速训练速度 +* 利用更多的计算资源 + +减少数据载入的耗时 +++++++++++++++++++ + +使用 :code:`pydataprovider`时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大地加速数据载入流程。 +:code:`DataProvider` 缓存池的减小,和之前通过减小缓存池来减小内存占用的原理一致。 + +.. literalinclude:: reduce_min_pool_size.py + +同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 + + +加速训练速度 +++++++++++++ + +PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任意一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`。 + +这里以简单的 :code:`word2vec` 训练语言模型为例,具体使用方法为\: + +使用一个词的前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: + +.. literalinclude:: word2vec_dataprovider.py + +这个任务的配置为\: + +.. literalinclude:: word2vec_config.py + +更多关于sparse训练的内容请参考 `sparse训练的文档 `_ + +利用更多的计算资源 +++++++++++++++++++ + +利用更多的计算资源可以分为以下几个方式来进行\: + +* 单机CPU训练 + * 使用多线程训练。设置命令行参数 :code:`trainer_count`,即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4` +* 单机GPU训练 + * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true` + * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练,使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4` +* 多机训练 + * 使用多机训练的方法也比较简单,需要先在每个节点启动 :code:`paddle pserver`,再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址 + * 具体的多机训练方法参考 `多机训练 `_ 文档。 + + +3. 遇到“非法指令”或者是“illegal instruction” +-------------------------------------------- + +paddle在进行计算的时候为了提升计算性能,使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说,执行 :code:`grep avx /proc/cpuinfo` 看看是否有输出即可知道是否支持。(另:用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉,请当成是不支持,看下面的解决方案) + +解决办法是\: + +* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_ +* 或者,使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。 + + +4. 如何选择SGD算法的学习率 +-------------------------- + +在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 + +通常做法是从一个比较大的learning_rate开始试,如果不收敛,则将学习率缩小10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出,如果模型采用不变的输出,最小的cost0是多少。 + +如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 + + +5. 如何初始化参数 +----------------- + +默认情况下,PaddlePaddle使用均值为0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: + +* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 + +.. code-block:: python + + hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), + bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。 + +6. 如何共享参数 +--------------- + +PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是让想要共享的参数使用同样的 :code:`ParamAttr` 对象。 + +简单的全连接网络,参数共享的配置示例为\: + +.. 
literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 + + diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc_cn/faq/reduce_min_pool_size.py new file mode 100644 index 00000000000000..2811b134b66b1e --- /dev/null +++ b/doc_cn/faq/reduce_min_pool_size.py @@ -0,0 +1,6 @@ +@provider(min_pool_size=0, ...) +def process(settings, filename): + os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. + with open('%s.shuf' % filename, 'r') as f: + for line in f: + yield get_sample_from_line(line) \ No newline at end of file diff --git a/doc_cn/faq/word2vec_config.py b/doc_cn/faq/word2vec_config.py new file mode 100644 index 00000000000000..e347252476eab6 --- /dev/null +++ b/doc_cn/faq/word2vec_config.py @@ -0,0 +1,8 @@ +... # the settings and define data provider is omitted. +DICT_DIM=3000 # dictionary dimension. +word_ids=data_layer('word_ids', size=DICT_DIM) + +emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) +emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) +predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) +outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM))) \ No newline at end of file diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc_cn/faq/word2vec_dataprovider.py new file mode 100644 index 00000000000000..a0a39080cece90 --- /dev/null +++ b/doc_cn/faq/word2vec_dataprovider.py @@ -0,0 +1,8 @@ +DICT_DIM=3000 +@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) +def process(settings, filename): + with open(filename) as f: + # yield word ids to predict inner word id + # such as [28, 29, 10, 4], 4 + # It means the sentance is 28, 29, 4, 10, 4. + yield read_next_from_file(f) \ No newline at end of file diff --git a/doc_cn/index.rst b/doc_cn/index.rst index 6cf5588b5b34f5..d2d50fbdb47f27 100644 --- a/doc_cn/index.rst +++ b/doc_cn/index.rst @@ -3,6 +3,7 @@ PaddlePaddle文档 使用指南 -------- + * `快速入门 `_ * `编译与安装 `_ * `用户接口 `_ @@ -16,4 +17,13 @@ PaddlePaddle文档 算法教程 -------- -* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_ + +* `Recurrent Group教程 `_ +* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_ +* `双层RNN示例 `_ +* `支持双层序列作为输入的Layer `_ + +常见问题 +-------- + +* `常见问题 `_ diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py index 4eab5b1fd3b50a..bf13b56372b56a 100644 --- a/doc_cn/ui/data_provider/mnist_provider.dict.py +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -2,10 +2,10 @@ # Define a py data provider -@provider(input_types=[ - dense_vector(28 * 28), - integer_value(10) -]) +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) def process(settings, filename): # settings is not used currently. f = open(filename, 'r') # open one of training file @@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently. pixels_float.append(float(each_pixel_str)) # give data to paddle. 
- yield { "pixel": pixels_float, 'label': int(label) } + yield {"pixel": pixels_float, 'label': int(label)} f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index 9e1d8c531f5ba2..80b40084d8f503 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: 是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size * cache 是数据缓存的策略,参考 `cache`_ * init_hook 是初始化时调用的函数,参考 `init_hook`_ -* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以 - 返回一个list或者tuple。如果是false的话,只能够返回list或者tuple * check 设置成true的话,会根据input_types检查数据的合法性。 * check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 check是false的话,没有作用。 diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index f7019b27f8f02a..657fdf65e92c9d 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then esac done shift $(($OPTIND - 1)) - export PYTHONPATH=$PYPATH + export PYTHONPATH=$PYPATH:$PYTHONPATH $@ else echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt old mode 100644 new mode 100755 index e03a9a1baa0041..cdb730bb3cec7a --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -2,10 +2,17 @@ set(AVX_SOURCES src/hl_math.cc src/hl_avx_functions.cc ) -set(CUDA_SOURCES - src/hl_time.cc - src/hl_cpu_functions.cc - ${AVX_SOURCES}) + +if(WITH_AVX) + set(CUDA_SOURCES + src/hl_time.cc + src/hl_cpu_functions.cc + ${AVX_SOURCES}) +else() + set(CUDA_SOURCES + src/hl_time.cc + src/hl_cpu_functions.cc) +endif() set(CUDA_CXX_WITH_GPU_SOURCES src/hl_cuda_cublas.cc diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 77e2649b172144..1fe2774cc5a291 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -185,7 +185,7 @@ typedef struct { size_t nnz; } _hl_sparse_matrix_s, *hl_sparse_matrix_s; -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE /** * HPPL data type: real (float or double) * diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index aa4720f6ca749f..b5240da0f398c8 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -169,7 +169,7 @@ extern void hl_avgpool_forward( * @brief Maximum pool backward. * * @param[in] frameCnt batch size of input image. - * @param[in] outGrad input data. + * @param[in] outGrad output grad data. * @param[in] channels number of channel. * @param[in] height image height. * @param[in] width image width. @@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad, const size_t outputW, const size_t numChannels); +/** + * @brief MaxOut forward. + * + * @param[in] inData input data. + * @param[out] outData output data. + * @param[out] idData output maxId. + * @param[in] batchSize batchSize. + * @param[in] size number of channels * image height * image width. + * @param[in] featLen feature length = image height * image width. + * @param[in] groups number of groups. + */ +extern void hl_maxout_forward( + const real* inData, real* outData, int* idData, + size_t batchSize, size_t size, size_t featLen, size_t groups); + +/** + * @brief MaxOut backward. + * + * @param[out] inGrad input grad data. + * @param[in] outGrad output grad data. + * @param[in] idData output maxId. + * @param[in] batchSize batchSize. 
+ * @param[in] size number of channels * image height * image width. + * @param[in] featLen feature length = image height * image width. + * @param[in] groups number of groups. + */ +extern void hl_maxout_backward( + real* inGrad, const real* outGrad, const int* idData, + size_t batchSize, size_t size, size_t featLen, size_t groups); + #endif /* HL_CNN_H_ */ diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index cba1c9f30da8d5..d39cf67448b4f2 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/math/MathFunctions.h" -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CBLAS_GEMM paddle::gemm #else #define CBLAS_GEMM paddle::gemm diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/cuda/include/hl_gpu_functions.cuh index 38df4eb8958f21..a2c5ebd18a4403 100644 --- a/paddle/cuda/include/hl_gpu_functions.cuh +++ b/paddle/cuda/include/hl_gpu_functions.cuh @@ -28,7 +28,7 @@ namespace hppl { const real min = SIGMOID_THRESHOLD_MIN; const real max = SIGMOID_THRESHOLD_MAX; real tmp = (a < min) ? min : ((a > max) ? max : a); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE return __fdividef(1.0f, 1.0f + __expf(-tmp)); #else return 1.0 / (1.0 + exp(-tmp)); @@ -36,7 +36,7 @@ namespace hppl { } __device__ static real tanh(const real a) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f; #else return (2.0 / (1.0 + exp(-2.0*a))) - 1.0; diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh index 473d394c0c688d..a3645ef51e6ef7 100644 --- a/paddle/cuda/include/hl_matrix_base.cuh +++ b/paddle/cuda/include/hl_matrix_base.cuh @@ -30,7 +30,7 @@ limitations under the License. */ #define INLINE inline #endif -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define DEVICE_FMAX fmaxf #define DEVICE_FMIN fminf #else diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 6917f362901411..51e483d1fb2ff3 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -21,7 +21,7 @@ limitations under the License. */ #ifdef __CUDA_ARCH__ // typedef void* vecType; #include -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; #else typedef double2 vecType; @@ -30,7 +30,7 @@ typedef double2 vecType; #include #include #include -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE typedef __m128 vecType; #else typedef __m128d vecType; diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index 828c21beb2fbd4..46d86b2982f065 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad, */ extern void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch); diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh index c90d49e4adeb5e..45db2f313e0d6e 100644 --- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh @@ -20,7 +20,7 @@ limitations under the License. 
*/ #define VECTOR_SIZE 16 -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE /* number of float in vector */ #define VECTOR_LEN 4 #define VECTOR_SET _mm_set_ps1 @@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) { return hl_check_align(reinterpret_cast(ptr)); } -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE template inline real hl_agg_op(Agg agg, vecType mm) { __m128 lo = _mm_unpacklo_ps(mm, mm); diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index aa9442fb80237e..cf79fad9004cd8 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad, const size_t outputW, const size_t numChannels) {} +inline void hl_maxout_forward( + const real* inData, real* outData, int* idData, + size_t batchSize, size_t size, size_t featLen, size_t group) {} + +inline void hl_maxout_backward( + real* inGrad, const real* outGrad, const int* idData, + size_t batchSize, size_t size, size_t featLen, size_t group) {} + #endif // HL_CNN_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 417f40e0a69f6c..aabd956c37f7dc 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad, inline void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch) {} diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index f965adc13575c1..499b61195af5e1 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad, threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW); CHECK_SYNC("hl_bilinear_backward failed"); -} \ No newline at end of file +} + +__global__ void maxoutFpCompute(size_t nthreads, const real * inData, + real * outData, int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + real max = inData[data_idx]; + int maxId = 0; + for (size_t g = 1; g < groups; ++g) { + real tmp = inData[data_idx + g * featLen]; + if (tmp > max) { + max = tmp; + maxId = g; + } + } + outData[index] = max; + idData[index] = maxId; + } +} + +void hl_maxout_forward(const real* inData, real* outData, + int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( + num_kernels, inData, outData, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_forward failed"); +} + +__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, + const real* outGrad, const int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t newIndex = batch_idx * size; + size_t gradIdx = 
(channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; + } +} + +void hl_maxout_backward(real* inGrad, const real* outGrad, + const int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_backward failed"); +} diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index dc109487ded20f..b3c9001ba39736 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) } /* namespace dynload */ -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CUBLAS_GEAM dynload::cublasSgeam #define CUBLAS_GEMV dynload::cublasSgemv #define CUBLAS_GEMM dynload::cublasSgemm diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index c2dce1977bdf5d..b215c0f6e33a18 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); CHECK_NOTNULL(hl_desc); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); CHECK_NOTNULL(hl_desc); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -921,7 +921,7 @@ void hl_softmax_forward(real *input, int height, int width) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value, int height, int width) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index f4c07367b485b8..e9fe9f1c117a05 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) { void hl_rand(real *dest_d, size_t num) { pthread_mutex_lock(t_resource.gen_mutex); CHECK_EQ( -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE dynload::curandGenerateUniform(t_resource.gen, dest_d, num), #else dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 38e4f16217c2a4..067e68c41e1198 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d, CHECK_SYNC("hl_matrix_add failed"); } -#ifdef 
HPPL_TYPE_DOUBLE +#ifdef PADDLE_TYPE_DOUBLE #define THRESHOLD 128 #else #define THRESHOLD 64 @@ -102,7 +102,7 @@ void subMaxAndExp(real* I, val = -THRESHOLD; } I[nextIdx] = val; -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE O[nextIdx] = __expf(val); #else O[nextIdx] = exp(val); diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index e028880156e5b1..63824eaa4c201c 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -374,7 +374,7 @@ template __global__ void KeSequence2Batch(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount) { int idx = threadIdx.x; @@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch, void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch) { diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index 13e89390d68c22..c3b98f4ebc38db 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, } /* best perf */ -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CU_CSCMM_THREAD_M_BEST 9 #else #define CU_CSCMM_THREAD_M_BEST 4 diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index c3b4769f7612b7..8cefbb30ada46d 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { } } -DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu, +DoubleBuffer::DoubleBuffer(DataProvider *dataPool, + bool useGpu, int64_t batchSize) { batchSize_ = batchSize; dataPool_ = dataPool; @@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) { } void DoubleBuffer::insertOneBatch(DataBatch* batch) { + while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out + if (stopping_) return; + } BufferBatch* bufBatch = bufferQueue_->dequeue(); // clone and copy the data from an Threadlocal Variable bufBatch->clone(batch, useGpu_); @@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() { actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch); } insertOneBatch(&newBatch); - } while (actualSize > 0); + } while (actualSize > 0 && !stopping_); } } diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 534491d70d5467..112e45de1cb232 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -259,7 +259,9 @@ typedef Queue BufferBatchQueue; class DoubleBuffer { public: - DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); + DoubleBuffer(DataProvider* dataPool, + bool useGpu, + int64_t batchSize = 0); virtual ~DoubleBuffer(); void removeOneBatch(DataBatch* dataBatch); @@ -308,7 +310,8 @@ class DataProvider { /** * @brief create only used for unittest. 
*/ - inline static DataProvider* create(const DataConfig &config, bool useGpu) { + inline static DataProvider* create(const DataConfig &config, + bool useGpu = FLAGS_use_gpu) { return create(config, ModelConfig(), useGpu); } @@ -348,7 +351,6 @@ class DataProvider { */ virtual void reset() { if (doubleBuffer_ != nullptr) { - LOG(INFO) << "the double-buffer is starting ..."; doubleBuffer_->startAsyncLoad(); } } diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 2f9a1223c6e454..ca8b07af49ca07 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -14,13 +14,20 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON +#include #include #include #include #include +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include #include "DataProvider.h" + #include "paddle/utils/PythonUtil.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Stat.h" namespace paddle { @@ -202,7 +209,10 @@ class PyDataProvider2 : public DataProvider { PyDataProvider2(const DataConfig& config, const ModelConfig& modelConfig, bool useGpu) - :DataProvider(config, useGpu), callingContextCreated_(2) { + :DataProvider(config, useGpu), + callingContextCreated_(2) { + if (PyArray_API == NULL) + import_array(); auto& args = config.load_data_args(); PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); if (!args.empty()) { @@ -246,8 +256,7 @@ class PyDataProvider2 : public DataProvider { PyObjectPtr && kwargs) { LOG(INFO) << "loading dataprovider " << model <<"::" << className; - PyObjectPtr module(PyImport_ImportModule(model.c_str())); - CHECK_PY(module) << "Cannot imort module " << model.c_str(); + PyObjectPtr module = py::import(model); PyObjectPtr moduleDict(PyModule_GetDict(module.get())); CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), @@ -455,6 +464,7 @@ class PyDataProvider2 : public DataProvider { std::condition_variable pushCV_; std::condition_variable pullCV_; std::mutex mtx_; + ThreadBarrier callingContextCreated_; std::unique_ptr cache_; @@ -497,8 +507,8 @@ class PyDataProvider2 : public DataProvider { * Resetting the PyDataProvider. May start reading thread here. */ virtual void reset() { - DataProvider::reset(); resetImpl(true); + DataProvider::reset(); } /** @@ -519,6 +529,7 @@ class PyDataProvider2 : public DataProvider { * Loading a batch of data. */ int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { + REGISTER_TIMER("PyDP2.getNextBatchInternal") CHECK_GE(size_, 0); size_t size = (size_t) size_; if (loadThread_) { // loading from thread should wait for data pool ready. @@ -699,10 +710,22 @@ class DenseScanner: public IFieldScanner { */ virtual void fill(Argument &argument, PyObject *obj) { real* dat = argument.value->getData() + height_ * headerPtr_->dim; - py::SequenceHelper s(obj); - // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. 
- for (size_t i=0; i < headerPtr_->dim; ++i) { - dat[i] = (real) s.getDouble(i); + if (PyArray_Check(obj)) { + auto dtype = PyArray_DTYPE((PyArrayObject*)obj); + if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { + real * data = (real*)PyArray_DATA((PyArrayObject*)obj); + auto sz = PyArray_SIZE((PyArrayObject*)obj); + std::copy(data, data + sz, dat); + } else { + LOG(FATAL) << "You should yield float" << sizeof(real) * 8 + << " array"; + } + } else { + py::SequenceHelper s(obj); + // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. + for (size_t i=0; i < headerPtr_->dim; ++i) { + dat[i] = (real) s.getDouble(i); + } } ++height_; } diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp index 273925ba55ee40..22579891f397af 100644 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp @@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator { public: virtual void init(const EvaluatorConfig& config) { - CHECK(!FLAGS_use_gpu) << "Not supported"; Evaluator::init(config); if (config.chunk_scheme() == "IOB") { numTagTypes_ = 2; @@ -137,6 +136,7 @@ class ChunkEvaluator : public Evaluator { CHECK_EQ(arguments.size(), (size_t)2); IVectorPtr& output = arguments[0].ids; IVectorPtr& label = arguments[1].ids; + CHECK(!output->useGpu() && !label->useGpu()) << "Not supported"; auto sequenceStartPositions = arguments[1].sequenceStartPositions->getVector(false); CHECK_EQ(output->getSize(), label->getSize()); diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 787ce703a08aef..0ded30eeb44e95 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse( para->getMat(PARAMETER_GRADIENT).get()); std::vector& ids = mainMat->getIds(threadId_); - ids.clear(); for (auto slaveParams : slaveParameters) { SparseRowCpuMatrix* mat = dynamic_cast((*slaveParams)[pid] diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index fc38bca3c403b2..340cd1b9f8e927 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, const std::vector inArgs; std::vector outArgs; frames_[i]->forward(inArgs, &outArgs, passType); + if (hasSubseq) { + for (auto& outFrameLine : outFrameLines_) { + CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions) + << "In hierachical RNN, all out links should be from sequences."; + } + } } if (evaluator_ && passType == PASS_TEST) { this->eval(evaluator_.get()); @@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, std::vector sequenceStartPositions; const int* subSequenceStartPositions = nullptr; - if (hasSubseq) { // for sequenceScatterAgentLayer - subSequenceStartPositions = - input.subSequenceStartPositions->getData(false); + if (hasSubseq) { // for sequenceScatterAgentLayer + subSequenceStartPositions = input.subSequenceStartPositions->getData(false); inlinkInfo->seqStartPosIndex.clear(); inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 } // maxSequenceLength_: max topLevelLength in allsamples for (int i = 0; i < maxSequenceLength_; ++i) { if (hasSubseq) { - 
sequenceStartPositions.push_back(0); // first element = 0 + sequenceStartPositions.push_back(0); // first element = 0 } int numSeqs = 0; for (size_t j = 0; j < numSequences; ++j) { @@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, } if (hasSubseq) { // inFrameLine create sequenceStartPositions one time - CHECK_EQ(sequenceStartPositions.size(), - static_cast(maxSequenceLength_ + - input.getNumSubSequences())); + CHECK_EQ( + sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + input.getNumSubSequences())); CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), static_cast(maxSequenceLength_ + 1)); createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); @@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, newPaths.end(), Path::greaterPath); newPaths.resize(totalExpandCount + minNewPathSize); - real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize, - newPaths.end())->logProb; - real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize, - newPaths.end())->logProb; + real minPathLogProb = + std::min_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + real maxPathLogProb = + std::max_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 056e9568852ac9..5e07446c71ff62 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "AgentLayer.h" #include "paddle/utils/Logging.h" @@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) { // get Arguments from real layers if (numSamples_ > 0 && numSamples_ < realNumSequences) { - int numRows = realOutput.sequenceStartPositions-> - getData(false)[numSamples_]; + int numRows = + realOutput.sequenceStartPositions->getData(false)[numSamples_]; CHECK(!realOutput.ids) << "Not supported"; output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_, /* trans */ false, /* seqFlag */ true, @@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) { int width = this->getSize(); if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, - width, useGpu_); + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, + useGpu_); } else { // used in generation if (realLayer_->getOutput().ids) { IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); @@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { if (realOutArg_.value || realOutArg_.ids) { CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, - width, useGpu_, /* trans */ false, /* seqFlag */ true, + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, + useGpu_, /* trans */ false, /* seqFlag */ true, /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); } else { @@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) { CHECK_NE(input.sequenceStartPositions.get(), output_.sequenceStartPositions.get()); ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences + 1, false); + numSequences + 1, false); int* outStarts = output_.sequenceStartPositions->getMutableData(false); - IVector::resizeOrCreate(cpuInputStartPos_, height, false); - int* inStarts = cpuInputStartPos_->getData(); + ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); + int* inStarts = inputStartPos_->getMutableData(false); + size_t offsetOut = 0; for (size_t i = 0; i < numSequences; ++i) { outStarts[i] = offsetOut; @@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { } outStarts[numSequences] = offsetOut; - if (useGpu_) { - IVector::resizeOrCreate(inputStartPos_, height, true); - inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT); - } else { - inputStartPos_ = cpuInputStartPos_; - } - outputValue->copyByRowIndex(*input.value, *inputStartPos_); + outputValue->copyByRowIndex(*input.value, + *inputStartPos_->getVector(useGpu_)); } } diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index d82078dd933294..3d7bf558340707 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer { protected: // use to store expanded cpuStartPositions or subSequenceStartPositions // of real layer. 
- IVectorPtr cpuInputStartPos_; - - // point to cpuInputStartPos_ when useGpu_ is false - // copy from cpuInputStartPos_ when useGpu_ is true - IVectorPtr inputStartPos_; + ICpuGpuVectorPtr inputStartPos_; public: explicit SequenceScatterAgentLayer(const LayerConfig& config) diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index 374117b7659bbe..7401cdc9a516bb 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "AverageLayer.h" #include "paddle/utils/Logging.h" @@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer); bool AverageLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); + SequencePoolLayer::init(layerMap, parameterMap); - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); // average strategy @@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap, } else { LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); } - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void AverageLayer::forward(PassType passType) { - Layer::forward(passType); - - // average layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - size_t dim = getSize(); - const Argument& input = getInput(0); - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? input.subSequenceStartPositions - : input.sequenceStartPositions; - const int* starts = startPositions->getData(false); - size_t numSequences = startPositions->getSize() - 1; - - // check - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(starts[numSequences], input.getBatchSize()); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } + SequencePoolLayer::forward(passType); - CHECK_EQ(dim, input.value->getWidth()); - - resetOutput(newBatchSize, dim); - auto startsPos = startPositions->getVector(useGpu_); MatrixPtr inputValue = getInputValue(0); - getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_); - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } + getOutputValue()->sequenceAvgForward( + *inputValue, *startPositions_->getVector(useGpu_), mode_); /* add the bias-vector AFTER average operation */ if (biases_.get() != NULL) { @@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) { } void AverageLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - ICpuGpuVectorPtr startPositions = - type_ ? 
input.subSequenceStartPositions - : input.sequenceStartPositions; - const int* starts = startPositions->getData(false); - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); + const int* starts = startPositions_->getData(false); MatrixPtr grad = getInputGrad(0); + if (grad) { size_t dim = getSize(); real* gradientData = getInputGrad(0)->getData(); real* gradient = getOutputGrad()->getData(); - size_t numSequences = startPositions->getSize() - 1; + size_t numSequences = startPositions_->getSize() - 1; for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { // TODO(Dangqingqing) optimization for GPU int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index ae910ddefad137..1edc2ace492c5b 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" namespace paddle { @@ -23,20 +22,21 @@ namespace paddle { /** * A layer for "internal average" for sequence input. * Input: one or more sequences. Each sequence contains some instances. - * If AverageLevel = kNonSeq: + * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = average_{for each instance in this sequence}{input[i]} - * If AverageLevel = kSeq: + * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences * output[i] = average_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. */ - -class AverageLayer : public Layer { +class AverageLayer : public SequencePoolLayer { public: enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; - enum AverageLevel { kNonSeq = 0, kSeq = 1 }; - explicit AverageLayer(const LayerConfig& config) : Layer(config) {} + explicit AverageLayer(const LayerConfig& config) + : SequencePoolLayer(config) {} ~AverageLayer() {} @@ -46,11 +46,8 @@ class AverageLayer : public Layer { void backward(const UpdateCallback& callback = nullptr); protected: - std::unique_ptr biases_; MatrixPtr outMtx_; MatrixPtr dataMtx_; int mode_; - int type_; }; - } // namespace paddle diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp index bbd0b53273b430..9290ce4f6d46c1 100644 --- a/paddle/gserver/layers/ExpandLayer.cpp +++ b/paddle/gserver/layers/ExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ExpandLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) { const Argument& shapeInput = getInput(1); const Argument& dataInput = getInput(0); size_t outputBatchSize = shapeInput.getBatchSize(); - auto startPositions = - type_ ? 
shapeInput.subSequenceStartPositions - : shapeInput.sequenceStartPositions; + auto startPositions = type_ ? shapeInput.subSequenceStartPositions + : shapeInput.sequenceStartPositions; size_t numSequences = startPositions->getSize() - 1; const int* starts = startPositions->getData(false); @@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) { // set output sequence info as shape sequence output_.sequenceStartPositions = shapeInput.sequenceStartPositions; if (shapeInput.hasSubseq()) { - output_.subSequenceStartPositions = - shapeInput.subSequenceStartPositions; + output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions; } // reserve output: Expand output to batchsize of sequence data. @@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) { MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); - IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false); - int* expandStarts = cpuExpandStartsPos_->getData(); + ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false); + int* expandStarts = expandStartsPos_->getMutableData(false); for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; for (int j = 0; j < sequenceLength; j++) { @@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) { } } - if (useGpu_) { - // TODO(Dangqingqing) move copyFrom - IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true); - expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT); - } else { - expandStartsPos_ = cpuExpandStartsPos_; - } - - outputValue->copyByRowIndex(*inputValue, *expandStartsPos_); + outputValue->copyByRowIndex(*inputValue, + *expandStartsPos_->getVector(useGpu_)); if (biases_.get() != NULL) { outputValue->addBias(*(biases_->getW()), 1); @@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) { void ExpandLayer::backward(const UpdateCallback& callback) { if (biases_ && biases_->getWGrad()) { biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - /* Increasing the number of gradient */ + /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); } if (!getInputGrad(0)) return; MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto cpuSeqStartPos = - type_ ? getInput(1).subSequenceStartPositions - : getInput(1).sequenceStartPositions; + auto cpuSeqStartPos = type_ ? 
getInput(1).subSequenceStartPositions + : getInput(1).sequenceStartPositions; size_t numSequences = cpuSeqStartPos->getSize() - 1; const int* starts = cpuSeqStartPos->getData(false); diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h index 8a3eb1c973a475..fbe0ced9b1754d 100644 --- a/paddle/gserver/layers/ExpandLayer.h +++ b/paddle/gserver/layers/ExpandLayer.h @@ -44,14 +44,9 @@ class ExpandLayer : public Layer { enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; /// store the ExpandLevel int type_; - // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_ - // and expandStartsPos_ /// expanded sequenceStartPositions or subSequenceStartPositions /// of input[1] - IVectorPtr cpuExpandStartsPos_; - /// point to cpuExpandStartsPos_ when useGpu_ is false, - /// copy from cpuExpandStartsPos_ when useGpu_ is true - IVectorPtr expandStartsPos_; + ICpuGpuVectorPtr expandStartsPos_; public: explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp index 226e0ea87dbd4a..c4ffe894eccd61 100644 --- a/paddle/gserver/layers/MaxLayer.cpp +++ b/paddle/gserver/layers/MaxLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MaxLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -21,55 +20,11 @@ namespace paddle { REGISTER_LAYER(max, MaxLayer); -bool MaxLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); - return true; -} - void MaxLayer::forward(PassType passType) { - Layer::forward(passType); - // max layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - size_t dim = getSize(); - const Argument& input = getInput(0); - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? input.subSequenceStartPositions - : input.sequenceStartPositions; - auto starts = startPositions->getVector(useGpu_); - size_t numSequences = startPositions->getSize() - 1; + SequencePoolLayer::forward(passType); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize()); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } - - // reset output: resize to "num of sequences", not "batch size". 
- resetOutput(newBatchSize, dim); - - IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_)); + IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(), + useGpu(deviceId_)); maxIndex_->zeroMem(); MatrixPtr inputValue = getInputValue(0); @@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) { { REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); - outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_); - } - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no cpuSequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new cpuSequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); + outputValue->maxSequenceForward( + *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_); } if (config_.output_max_index()) { @@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) { void MaxLayer::backward(const UpdateCallback& callback) { CHECK(!config_.output_max_index()) << "backward is not available when output_max_index is set"; - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); if (inputGrad) { - ICpuGpuVectorPtr starts = - type_ ? getInput(0).subSequenceStartPositions - : getInput(0).sequenceStartPositions; REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); - inputGrad->maxSequenceBackward(*outputGrad, - *(starts->getVector(useGpu_)), *maxIndex_); + inputGrad->maxSequenceBackward( + *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_); } } diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index b4c34e665d926d..e6dcfe9c6759d1 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/ThreadLocal.h" @@ -24,29 +24,30 @@ namespace paddle { /** * A layer for "internal max" for sequence input. * Input: one or more sequences. Each sequence contains some instances. - * If MaxLevel = kNonSeq: + * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} - * If MaxLevel = kSeq: + * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences * output[i] = max_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. */ -class MaxLayer : public Layer { +class MaxLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. 
IVectorPtr maxIndex_; - int type_; public: - explicit MaxLayer(const LayerConfig& config) : Layer(config) {} - enum MaxLevel {kNonSeq = 0, kSeq = 1 }; + explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} ~MaxLayer() {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + return SequencePoolLayer::init(layerMap, parameterMap); + } void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp new file mode 100644 index 00000000000000..a3de069bf7a6c9 --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MaxOutLayer.h" +#include "hl_gpu.h" +#include "hl_cnn.h" + +namespace paddle { + +REGISTER_LAYER(maxout, MaxOutLayer); + +size_t MaxOutLayer::getSize() { + const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = maxoutConf.img_size_y(); + } + if (imgSizeW_ == 0) { + imgSizeW_ = maxoutConf.img_size_x(); + } + + featLen_ = imgSizeH_ * imgSizeW_; + size_t layerSize = featLen_ * outputChannels_; + + getOutput().setFrameHeight(imgSizeH_); + getOutput().setFrameWidth(imgSizeW_); + + return layerSize; +} + +bool MaxOutLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* the size of inputs for maxout-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); + groups_ = conf.groups(); + channels_ = conf.channels(); + CHECK_EQ(channels_ % groups_, 0UL); + outputChannels_ = channels_ / groups_; + + return true; +} + +void MaxOutLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + /* note: one sample correspond to one column */ + size_t batchSize = getInput(0).getBatchSize(); + size_t size = getSize(); + resetOutput(batchSize, size); + MatrixPtr inputV = getInputValue(0); + MatrixPtr outV = getOutputValue(); + + IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_); + outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_); +} + +void MaxOutLayer::backward(const UpdateCallback& callback) { + (void)callback; + + /* Do derivation */ + MatrixPtr inputG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + + if (inputG) { + inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h new file mode 100644 index 00000000000000..9011a5c332b17a --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.h @@ -0,0 +1,54 @@ +/* 
Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * A layer to do max out on conv layer output. + * Input: output of a conv layer. + * Output: feature map size same as input. Channel is (input channel) / groups. + * So the num of channels should be able to devided by groups. + * + * The config file api is maxout_layer. + */ + +class MaxOutLayer : public Layer { +protected: + size_t groups_; + size_t imgSizeH_, imgSizeW_; + /// outputChannels_ = channels_ / groups_ + size_t channels_, outputChannels_; + /// feature length = imgSizeH_ * imgSizeW_ + size_t featLen_; + IVectorPtr maxoutId_; + +public: + /// return imgSizeH_ * imgSizeW_ * outputChannels_; + size_t getSize(); + + explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} + virtual ~MaxOutLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp index a896e16a6027b3..4faebe5d2ad6f9 100644 --- a/paddle/gserver/layers/NCELayer.cpp +++ b/paddle/gserver/layers/NCELayer.cpp @@ -21,14 +21,18 @@ limitations under the License. */ namespace paddle { /** - * Noise-contrastive estimation + * Noise-contrastive estimation. * Implements the method in the following paper: - * A fast and simple algorithm for training neural probabilistic language models + * A fast and simple algorithm for training neural probabilistic language models. + * + * The config file api is nce_layer. */ class NCELayer : public Layer { int numClasses_; - int numInputs_; // number of input layer besides labelLayer and weightLayer + /// number of input layer besides labelLayer and weightLayer + int numInputs_; LayerPtr labelLayer_; + /// weight layer, can be None LayerPtr weightLayer_; WeightList weights_; std::unique_ptr biases_; @@ -43,7 +47,8 @@ class NCELayer : public Layer { real weight; }; std::vector samples_; - bool prepared_; // whether samples_ is prepared + /// whether samples_ is prepared + bool prepared_; Argument sampleOut_; IVectorPtr labelIds_; diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 12831e36688029..26d9536dd57aa3 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/utils/Logging.h" -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" @@ -29,20 +29,19 @@ namespace paddle { * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence - * of the input sequence + * of the input sequence + * + * The config file api is last_seq and first_seq. */ -class SequenceLastInstanceLayer : public Layer { +class SequenceLastInstanceLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; MatrixPtr tmpSrc_; MatrixPtr tmpDest_; - enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; - int type_; public: explicit SequenceLastInstanceLayer(const LayerConfig& config) - : Layer(config) {} + : SequencePoolLayer(config) {} ~SequenceLastInstanceLayer() {} @@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // seqlastins layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } + SequencePoolLayer::init(layerMap, parameterMap); tmpSrc_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); tmpDest_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void SequenceLastInstanceLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - const Argument& input = getInput(0); - - // check - auto startPositions = - type_ ? input.subSequenceStartPositions->getVector(false) - : input.sequenceStartPositions->getVector(false); - size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences(); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(startPositions->getData()[height], input.getBatchSize()); - CHECK_EQ(height, startPositions->getSize() - 1); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } + SequencePoolLayer::forward(passType); - reserveOutput(height, dim); - const int* starts = startPositions->getData(); + const int* starts = startPositions_->getData(false); MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); @@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); - for (size_t seqId = 0; seqId < height; ++seqId) { + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { int insId = config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; outputValue->subMatrix(seqId, 1, tmpDest_) ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); } - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. 
- */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } } if (biases_.get() != NULL) { @@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) { } void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { - /* activation, should set to 'linear' in most cases */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions = - type_ ? getInput(0).subSequenceStartPositions->getVector(false) - : getInput(0).sequenceStartPositions->getVector(false); - const int* starts = startPositions->getData(); - size_t numSequences = startPositions->getSize() - 1; + const int* starts = startPositions_->getData(false); + size_t numSequences = startPositions_->getSize() - 1; if (inputGrad) { AsyncGpuBlock asyncGpuBlock; diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp new file mode 100644 index 00000000000000..55be73d363df19 --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/utils/Logging.h" +#include "SequencePoolLayer.h" + +namespace paddle { + +bool SequencePoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // seqlastins/max/average layer should have exactly 1 input + CHECK_EQ(1U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + // transform to which sequence type + if (config_.trans_type() == "non-seq") { + type_ = kNonSeq; + } else if (config_.trans_type() == "seq") { + type_ = kSeq; + } else { + LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); + } + setNeedSequenceInfo(false); + return true; +} + +void SequencePoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences(); + size_t dim = getSize(); + // check + CHECK_EQ(dim, input.value->getWidth()); + startPositions_ = + type_ ? input.subSequenceStartPositions : input.sequenceStartPositions; + auto starts = startPositions_->getVector(false); + CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); + CHECK_EQ(newBatchSize_, starts->getSize() - 1); + + resetOutput(newBatchSize_, dim); + if (type_) { + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + } + /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, + * thus, in this case, output_ has no sequenceStartPositions. 
+ * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this + * case, we should compute the new sequenceStartPositions. + */ + if (type_) { + output_.degradeSequence(input, useGpu_); + } +} + +void SequencePoolLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h new file mode 100644 index 00000000000000..669af80e1d447a --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { +/** + * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. + * + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = seqlastin/average/max_{for each instance in this + * sequence}{input[i]} + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = seqlastin/average/max_{for each instance in this + * sub-sequence}{input[i]} + * + * The config file api is pooling_layer. + */ + +class SequencePoolLayer : public Layer { +protected: + int type_; + std::unique_ptr biases_; + enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; + size_t newBatchSize_; + ICpuGpuVectorPtr startPositions_; + +public: + explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~SequencePoolLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 5c3b062309c51f..321c78cb1741bc 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -14,12 +14,15 @@ from paddle.trainer.PyDataProvider2 import * +# Note that each config should has an independent provider +# in current design of PyDataProvider2. 
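Stepping back to the new SequencePoolLayer before the RNN test providers: the start-position bookkeeping it centralizes can be pictured with a small NumPy sketch. The helper name is illustrative only; with trans_type="seq" the same loop would run over subSequenceStartPositions and the output would stay a sequence via degradeSequence.

    import numpy as np

    def sequence_pool(values, start_positions, op='last'):
        """values: (num_instances, dim); start_positions: e.g. [0, 3, 5, 9]."""
        out = []
        for i in range(len(start_positions) - 1):
            seq = values[start_positions[i]:start_positions[i + 1]]  # one (sub)sequence
            if op == 'last':        # SequenceLastInstanceLayer (last_seq / first_seq)
                out.append(seq[-1])
            elif op == 'max':       # MaxLayer
                out.append(seq.max(axis=0))
            elif op == 'average':   # AverageLayer
                out.append(seq.mean(axis=0))
        return np.stack(out)        # height is the number of (sub)sequences, i.e. newBatchSize_

    values = np.arange(18, dtype=float).reshape(9, 2)
    print(sequence_pool(values, [0, 3, 5, 9], op='last'))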
+####################################################### data = [ [[[1, 3, 2], [4, 5, 2]], 0], [[[0, 2], [2, 5], [0, 1, 2]], 1], ] - +# Used for sequence_nest_rnn.conf @provider(input_types=[integer_value_sub_sequence(10), integer_value(3)], should_shuffle=False) @@ -27,7 +30,7 @@ def process_subseq(settings, file_name): for d in data: yield d - +# Used for sequence_rnn.conf @provider(input_types=[integer_value_sequence(10), integer_value(3)], should_shuffle=False) @@ -38,11 +41,32 @@ def process_seq(settings, file_name): seq += subseq yield seq, d[1] +# Used for sequence_nest_rnn_multi_input.conf +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)], + should_shuffle=False) +def process_subseq2(settings, file_name): + for d in data: + yield d + +# Used for sequence_rnn_multi_input.conf +@provider(input_types=[integer_value_sequence(10), + integer_value(3)], + should_shuffle=False) +def process_seq2(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] + +########################################################### data2 = [ [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], ] +# Used for sequence_nest_rnn_multi_unequalength_inputs.conf @provider(input_types=[integer_value_sub_sequence(10), integer_value_sub_sequence(10), integer_value(2)], @@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name): yield d +# Used for sequence_rnn_multi_unequalength_inputs.conf @provider(input_types=[integer_value_sequence(10), integer_value_sequence(10), integer_value(2)], diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index cbed1f15fc4157..b166e778d7a33f 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -21,7 +21,7 @@ def hook(settings, dict_file, **kwargs): settings.word_dict = dict_file settings.input_types = [integer_value_sequence(len(settings.word_dict)), - integer_value_sequence(3)] + integer_value(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -34,14 +34,14 @@ def process(settings, file_name): words = comment.split() word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - yield word_slot, [label] + yield word_slot, label ## for hierarchical sequence network def hook2(settings, dict_file, **kwargs): settings.word_dict = dict_file settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), - integer_value_sub_sequence(3)] + integer_value_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -57,7 +57,7 @@ def process2(settings, file_name): words = comment.split() word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - label_list.append([label]) + label_list.append(label) word_slot_list.append(word_slot) else: yield word_slot_list, label_list diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf index 62b8c5d072d7b4..93b08eb2f8746d 100644 --- a/paddle/gserver/tests/sequence_nest_rnn.conf +++ b/paddle/gserver/tests/sequence_nest_rnn.conf @@ -56,9 +56,8 @@ def outer_step(x): last = last_seq(input=inner_rnn_output, name="outer_rnn_state") # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it correctly. Current implementation requires that - # all the out links are from sequences. However, it does not report error - # when the out links are not sequences. 
+ # does not handle it, and will report error: In hierachical RNN, all out + # links should be from sequences now. return inner_rnn_output out = recurrent_group( diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf index e01b3f8e7aa5c4..0614958b4719dd 100644 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_subseq') + obj='process_subseq2') settings(batch_size=2, learning_rate=0.01) @@ -57,9 +57,8 @@ def outer_step(wid, x): last = last_seq(input=inner_rnn_output, name="outer_rnn_state") # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it correctly. Current implementation requires that - # all the out links are from sequences. However, it does not report error - # when the out links are not sequences. + # does not handle it, and will report error: In hierachical RNN, all out + # links should be from sequences now. return inner_rnn_output out = recurrent_group( diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf index 968621cab59be9..51881e21d971bb 100644 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_seq') + obj='process_seq2') settings(batch_size=2, learning_rate=0.01) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 425d669206cce3..db48cc47a4a638 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) { } } +TEST(Layer, maxoutLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("maxout"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + MaxOutConfig* maxout = input->mutable_maxout_conf(); + + maxout->set_img_size_x(32); + maxout->set_img_size_y(32); + maxout->set_channels(4); + maxout->set_groups(2); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "maxout", 10, false, useGpu); + } +} void testFcLayer(string format, size_t nnz) { TestConfig config; config.biasSize = 4096; diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index e75e53ab7f431a..6bf1e329251219 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) { } TEST(PyDataProvider2, init_hook) { - paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle")); + paddle::PyObjectPtr pickle = paddle::py::import("pickle"); paddle::PyObjectPtr globals( PyModule_GetDict(PyImport_AddModule("__main__"))); PyDict_SetItemString(globals.get(), "pickle", pickle.get()); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index 145fe85cff7d88..71c3335231e521 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ 
-86,7 +86,7 @@ def test_can_over_batch_size(setting, filename): yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] -@provider(input_types=[index_slot(10), index_slot(10)]) +@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)}) def test_input_order(setting, filename): for _ in xrange(1000): yield { diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index ae7f617371ca5f..d104db3e5b32d5 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -24,7 +23,7 @@ limitations under the License. */ P_DECLARE_int32(seed); using namespace paddle; // NOLINT -using namespace std; // NOLINT +using namespace std; // NOLINT class TrainerForTest : public paddle::Trainer { public: void startTrain() { @@ -44,11 +43,10 @@ class TrainerForTest : public paddle::Trainer { */ size_t getTotalParameterSize() const { auto p = const_cast(this); - auto & params = p->getGradientMachine()->getParameters(); - return std::accumulate(params.begin(), params.end(), 0UL, - [](size_t a, const ParameterPtr& p){ - return a+p->getSize(); - }); + auto& params = p->getGradientMachine()->getParameters(); + return std::accumulate( + params.begin(), params.end(), 0UL, + [](size_t a, const ParameterPtr& p) { return a + p->getSize(); }); } }; diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 469255719701a0..602d7db035deb5 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) { copyFrom(matrix); } -void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { +void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { size_t height = getHeight(); size_t width = getWidth(); CHECK_EQ(b.getWidth(), width); real* dst = getData(); real* src = b.getData(); - int* index = rowIndex.getData(); + const int* index = rowIndex.getData(); hl_sequence2batch_copy(dst, src, index, width, height, true); } @@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "Is not supported"; +} + +void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + const real* input = a.getData(); + real* output = getData(); + int* idForGpu = id.getData(); + + hl_maxout_forward(input, output, idForGpu, batchSize, size, + size / channels, groups); +} + +void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + real* input = getData(); + const real* output = a.getData(); + const int* idForGpu = id.getData(); + + hl_maxout_backward(input, output, idForGpu, batchSize, size, + size / channels, groups); +} + /*calulate the error of classification */ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { GpuMatrixPtr output_ptr = std::dynamic_pointer_cast(output); @@ -1329,11 +1365,11 @@ 
void CpuMatrix::copyFrom(const IVector& src) { } } -void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { +void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { size_t height = getHeight(); size_t width = getWidth(); CHECK_EQ(b.getWidth(), width); - int* index = rowIndex.getData(); + const int* index = rowIndex.getData(); for (size_t i = 0; i < height; i++) { CHECK_LT(static_cast(index[i]), b.getHeight()); real* src = b.getData() + index[i] * width; @@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getWidth(); + size_t beam = maxVal.getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getWidth(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getHeight(); + for (size_t i = 0; i < numSamples; i++) { + std::vector> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair(a[i + j * numSamples], j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + beam, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i + j * numSamples] = vec[j].first; + s[i + j * numSamples] = vec[j].second; + } + } +} + +void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + const real* input = a.getData(); + int* idForCpu = id.getData(); + + MatrixPtr maxInMat, maxOutMat; + Matrix::resizeOrCreate(maxInMat, groups, size, false, false); + Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false); + + for (size_t i = 0; i < channels; ++i) { + size_t newFeatLen = i * featLen; + for (size_t j = 0; j < groups; ++j) { + maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) + ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, + featLen); + } + } + maxInMat->colMax(*tmpId, *maxOutMat); + this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); + } +} + +void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + size_t newFeatLen = groups * featLen; + real* inputG = getData(); + const real* outG = a.getData(); + int* idForCpu = id.getData(); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + int* idData = idForCpu + newIndex; + + for (size_t i = 0; i < size; ++i) { + int gradIdx = + idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; + (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; + } + } +} + void CpuMatrix::rowNormalizeL1(Matrix& out) { CHECK(!out.useGpu()); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index b4922d7e6f5469..9b16ceacbfe98a 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -253,7 +253,7 @@ class Matrix : public 
BaseMatrix { LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; } - virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) { + virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { LOG(FATAL) << "Not implemented"; } @@ -493,16 +493,40 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } + /** + * set the max of each column of this to mat + */ virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } /** * @brief Get the top k elements of each row of this matrix. * * The column ids and values of these elements are stored in - * maxIds and max respectively. Note that the top k - * elements are not sorted. + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. */ virtual void rowMax(IVector& maxIds, Matrix& max) { LOG(FATAL) << "Not implemented"; @@ -995,7 +1019,7 @@ class GpuMatrix : public Matrix { void copyFrom(const IVector& src); - void copyByRowIndex(Matrix& b, IVector& rowIndex); + void copyByRowIndex(Matrix& b, const IVector& rowIndex); MatrixPtr clone(size_t height, size_t width, bool useGpu = false); @@ -1101,6 +1125,9 @@ class GpuMatrix : public Matrix { void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& max); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); @@ -1271,7 +1298,7 @@ class CpuMatrix : public Matrix { void copyFrom(CpuSparseMatrix& src); - void copyByRowIndex(Matrix& b, IVector& rowIndex); + void copyByRowIndex(Matrix& b, const IVector& rowIndex); MatrixPtr clone(size_t height, size_t width, bool useGpu = false); @@ -1425,6 +1452,9 @@ class CpuMatrix : public Matrix { void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& maxVal); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void rowNormalizeL1(Matrix& out); void oneHotCrossEntropy(Matrix& output, IVector& label); diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp index 0b5de252258a96..6986624d25c7a4 100644 --- a/paddle/math/SparseRowMatrix.cpp +++ b/paddle/math/SparseRowMatrix.cpp @@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < len; i ++) { + CHECK_LT(*(ids + i), this->getHeight()) + 
<< "id:" << *(ids + i) << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + } localIndices.insert(localIndices.end(), ids, ids + len); } void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { CpuSparseMatrix* mat = dynamic_cast(input.get()); - CHECK(mat) << "only support non value sparse matrix"; + CHECK(mat) << "only support sparse matrix"; addRows(reinterpret_cast(mat->getCols()), mat->getElementCnt()); } @@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { int* index = ids->getData(); for (size_t i = 0; i < numSamples; ++i) { if (index[i] == -1) continue; - localIndices.push_back((unsigned int)index[i]); + + unsigned int id = (unsigned int)index[i]; + CHECK_LT(id, this->getHeight()) + << "id:" << id << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + localIndices.push_back(id); } } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 2ff19e7b3f87ca..2cc38b82306e2b 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) { } } +void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW, + int channels, int groups) { + int inWidth = imgSizeH * imgSizeW * channels; + int outChannels = channels / groups; + int outWidth = imgSizeH * imgSizeW * outChannels; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); + IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); + IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + target->maxoutForward(*input, *id, outChannels, groups); + targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + idCheck->copyFrom(*idGpu); + VectorCheckEqual(*id, *idCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false, + true); + MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false, + false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); + inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Matrix, MaxOutFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + for (auto groups : {2, 4}) { + VLOG(3) << " 
numSamples=" << numSamples + << " channels=" << channels + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW + << " groups=" << groups; + testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); + } + } + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index 2f9606dc680265..ff251fe89f9f88 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -146,6 +146,12 @@ class Parameter { } } + void enableBufType(ParameterType type) { + if (bufs_[type]) return; + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + void enableIntType(ParameterType type, size_t intStoreSize = 0) { if (!intBufs_[type]) { SetDevice device(deviceId_); diff --git a/paddle/pserver/PserverForPython.h b/paddle/pserver/PserverForPython.h deleted file mode 100644 index 5bbeae8bd8b973..00000000000000 --- a/paddle/pserver/PserverForPython.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/pserver/ParameterClient.h" -#include "paddle/pserver/ParameterServer.h" -#include "paddle/parameter/Parameter.h" -#include - -namespace paddle { - -struct PyObjectDeleter { - void operator()(PyObject* obj) { - if (obj) { - Py_DECREF(obj); - } - } -}; - -class ParameterClientPy : public ParameterClient { -protected: - typedef std::unique_ptr PyObjectPtr; - - std::vector parameter_; - int initArgc_; - char** initArgv_; - -public: - ParameterClientPy(std::vector configs, int argc, - std::vector argv, bool useGpu) { - initArgc_ = argc; - initArgv_ = new char* [argc]; - for (int i = 0; i < argc; i++) { - initArgv_[i] = new char[argv[i].size()]; - strcpy(initArgv_[i], // NOLINT - argv[i].c_str()); // NOLINT TODO(yuyang18): use snprintf instead. 
- } - ParameterConfig pyConfig; - ParameterPtr param; - for (auto& config : configs) { - pyConfig.ParseFromString(config); - param.reset(new Parameter(pyConfig, useGpu)); - parameter_.push_back(param); - } - Py_Initialize(); - CHECK(Py_IsInitialized()); - } - - ~ParameterClientPy() { - delete initArgv_; - Py_Finalize(); - } - - Parameter getParameter(int idx) { return *(parameter_[idx].get()); } - - void initClientPy() { - initMain(initArgc_, initArgv_); - CHECK(init(parameter_)) << "Init Client Failed."; - } - - void setConfigPy(std::string config) { - OptimizationConfig optConfig; - optConfig.ParseFromString(config); - setConfig(optConfig); - } - - bool inStatusPy(int status) { return inStatus(PServerStatus(status)); } - - void setStatusPy(int status) { setStatus(PServerStatus(status)); } - - void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); } - - void sendParameterPy(int updateMode, int parameterType, int numSamples, - real cost, bool sendBackParameter) { - sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType), - int64_t(numSamples), real(cost), sendBackParameter); - } - - template - std::string asyncCallPy(const char* serviceName, const char* funcName, - const std::string in) { - ProtoIn protoIn; - ProtoOut protoOut; - std::mutex waitLock; - std::string data; - protoIn.ParseFromString(in); - waitLock.lock(); - auto callback = [&](ProtoOut* pOut, bool isSuccessful) { - if (isSuccessful) { - pOut->SerializeToString(&data); - } else { - LOG(INFO) << "Async Talk Failed."; - } - waitLock.unlock(); - }; - - ubClient_.asyncCall(serviceName, funcName, protoIn, - &protoOut, callback); - waitLock.lock(); - protoOut.SerializeToString(&data); - return data; - } -}; - -} // namespace paddle diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 0366bb636c704a..6d8f5da3e298fa 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -63,7 +63,8 @@ def __init__(self, input_type, pos): def scan(self, dat): self.extend_cols(dat) - self.__rows__.append(len(dat)) + self.__rows__.append(len(dat) + self.__rows__[-1]) + self.__height__ += 1 def extend_cols(self, dat): self.__cols__.extend(dat) diff --git a/paddle/scripts/travis/before_install.sh b/paddle/scripts/travis/before_install.linux.sh similarity index 100% rename from paddle/scripts/travis/before_install.sh rename to paddle/scripts/travis/before_install.linux.sh diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh new file mode 100755 index 00000000000000..f438e69b822aa4 --- /dev/null +++ b/paddle/scripts/travis/before_install.osx.sh @@ -0,0 +1,13 @@ +#!/bin/bash +brew update +brew tap homebrew/science +brew install python +sudo pip install --upgrade protobuf==2.6.0 +brew install homebrew/versions/protobuf260 --without-python +brew install cmake python glog gflags openblas wget md5sha1sum + +wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz +tar xf gtest.tar.gz +cd googletest-release-1.8.0/ +cmake . +make install diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 3ea633be327027..a73c32344c8abe 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -1,7 +1,22 @@ #!/bin/bash source ./common.sh -cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -make -j `nproc` -env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`" +CMAKE_EXTRA="" +if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then + CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib" +fi + + +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA} + +NPROC=1 +if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + NRPOC=`nproc` +elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then + NPROC=`sysctl -n hw.ncpu` +fi + + +make -j $NPROC +env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" sudo make install sudo paddle version diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index 91f7f4d29df938..d0fda1b6253e3e 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -20,6 +20,8 @@ limitations under the License. */ #include "paddle/math/SparseRowMatrix.h" #include "paddle/utils/Thread.h" +P_DECLARE_int32(trainer_count); + namespace paddle { SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig) @@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector& parameters) { false /*inPserver*/)); size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0; optimizers_[pid]->init(numRows, ¶->getConfig()); + if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) { + // For trainer_count=1, the gradient machine is NeuralNetwork, which does + // not create parameter buf for PARAMETER_GRADIENT for sparse update in + // Parameter::enableType(). But gradient parameter buf is still used + // in SgdThreadUpdater. We need to explicitly create it. + para->enableBufType(PARAMETER_GRADIENT); + } } } @@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse( // From MultiGradientMachine SparseRowIdsCpuMatrix* mainMat = dynamic_cast( para->getMat(PARAMETER_GRADIENT).get()); - const std::vector& sparseIds = mainMat->getIds(tid); + std::vector& sparseIds = mainMat->getIds(tid); for (auto id : sparseIds) { // setup sub bufs @@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse( optimizer->update(vecs, para->getConfig(), id); vecs[PARAMETER_GRADIENT]->zeroMem(); } + sparseIds.clear(); } else if (dynamic_cast( para->getMat(PARAMETER_GRADIENT).get())) { // From NeuralNetwork @@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse( optimizer->update(vecs, para->getConfig(), id); vecs[PARAMETER_GRADIENT]->zeroMem(); } + // For numThreads > 1, MultiGradientMachine is used, which goes + // to the above branch. + CHECK_EQ(numThreads, 1UL); + mainMat->clearIndices(); } else { auto & m = *para->getMat(PARAMETER_GRADIENT).get(); LOG(FATAL) << "Internal error: " << para->getName() << " " diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf index 5d2e2ba9df5c71..664e18cb986811 100644 --- a/paddle/trainer/tests/test_config.conf +++ b/paddle/trainer/tests/test_config.conf @@ -13,157 +13,71 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
- -default_initial_std(0.5) - -model_type("nn") - -DataLayer( - name = "input", - size = 3, -) - -DataLayer( - name = "weight", - size = 1, -) - -Layer( - name = "layer1_1", - type = "fc", - size = 5, - active_type = "sigmoid", - inputs = "input", -) - -Layer( - name = "layer1_2", - type = "fc", - size = 12, - active_type = "linear", - inputs = Input("input", parameter_name='sharew'), -) - -Layer( - name = "layer1_3", - type = "fc", - size = 3, - active_type = "tanh", - inputs = "input", -) - -Layer( - name = "layer1_5", - type = "fc", - size = 3, - active_type = "tanh", - inputs = Input("input", - learning_rate=0.01, - momentum=0.9, - decay_rate=0.05, - initial_mean=0.0, - initial_std=0.01, - format = "csc", - nnz = 4) -) - -FCLayer( - name = "layer1_4", - size = 5, - active_type = "square", - inputs = "input", - drop_rate = 0.5, -) - -Layer( - name = "pool", - type = "pool", - inputs = Input("layer1_2", - pool = Pool(pool_type="cudnn-avg-pool", - channels = 1, - size_x = 2, - size_y = 3, - img_width = 3, - padding = 1, - padding_y = 2, - stride = 2, - stride_y = 3)) -) - -Layer( - name = "concat", - type = "concat", - inputs = ["layer1_3", "layer1_4"], -) - -MixedLayer( - name = "output", - size = 3, - active_type = "softmax", - inputs = [ - FullMatrixProjection("layer1_1", - learning_rate=0.1), - TransposedFullMatrixProjection("layer1_2", parameter_name='sharew'), - FullMatrixProjection("concat"), - IdentityProjection("layer1_3"), - ], -) - -Layer( - name = "label", - type = "data", - size = 1, -) - -Layer( - name = "cost", - type = "multi-class-cross-entropy", - inputs = ["output", "label", "weight"], -) - -Layer( - name = "cost2", - type = "nce", - num_classes = 3, - active_type = "sigmoid", - neg_sampling_dist = [0.1, 0.3, 0.6], - inputs = ["layer1_2", "label", "weight"], -) - -Evaluator( - name = "error", - type = "classification_error", - inputs = ["output", "label", "weight"] -) - -Inputs("input", "label", "weight") -Outputs("cost", "cost2") - -TrainData( - ProtoData( - files = "dummy_list", - constant_slots = [1.0], - async_load_data = True, - ) -) - -TestData( - SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, - async_load_data = False, - ), -) - -Settings( - algorithm = "sgd", - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - batch_size = 100, - learning_rate = 0.001, - learning_rate_decay_a = 1e-5, - learning_rate_decay_b = 0.5, -) +from paddle.trainer_config_helpers import * + +TrainData(ProtoData( + files = "dummy_list", + constant_slots = [1.0], + async_load_data = True)) + +TestData(SimpleData( + files = "trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000, + async_load_data = False)) + +settings(batch_size = 100) + +data = data_layer(name='input', size=3) + +wt = data_layer(name='weight', size=1) + +fc1 = fc_layer(input=data, size=5, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=data, size=12, + bias_attr=True, + param_attr=ParamAttr(name='sharew'), + act=LinearActivation()) + +fc3 = fc_layer(input=data, size=3, + bias_attr=True, + act=TanhActivation()) + +fc4 = fc_layer(input=data, size=5, + bias_attr=True, + layer_attr=ExtraAttr(drop_rate=0.5), + act=SquareActivation()) + +pool = img_pool_layer(input=fc2, + pool_size=2, + pool_size_y=3, + num_channels=1, + padding=1, + padding_y=2, + stride=2, + stride_y=3, + img_width=3, + pool_type=CudnnAvgPooling()) + +concat = concat_layer(input=[fc3, fc4]) + +with 
mixed_layer(size=3, act=SoftmaxActivation()) as output: + output += full_matrix_projection(input=fc1) + output += trans_full_matrix_projection(input=fc2, + param_attr=ParamAttr(name='sharew')) + output += full_matrix_projection(input=concat) + output += identity_projection(input=fc3) + +lbl = data_layer(name='label', size=1) + +cost = classification_cost(input=output, label=lbl, weight=wt, + layer_attr=ExtraAttr(device=-1)) + +nce = nce_layer(input=fc2, label=lbl, weight=wt, + num_classes=3, + neg_distribution=[0.1, 0.3, 0.6]) + +outputs(cost, nce) diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore new file mode 100644 index 00000000000000..f2cfd7409412de --- /dev/null +++ b/paddle/utils/.gitignore @@ -0,0 +1 @@ +enable_virtualenv.c diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 0557b01e36f078..45240b5002aa18 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,6 +2,9 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) +create_resources(enable_virtualenv.py enable_virtualenv.c) +set(UTIL_RES enable_virtualenv.c) + if(APPLE) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) else() @@ -9,7 +12,8 @@ else() endif() add_library(paddle_utils STATIC ${UTIL_SOURCES} - ${UTIL_ARCH_SOURCES}) + ${UTIL_ARCH_SOURCES} + ${UTIL_RES}) add_style_check_target(paddle_utils ${UTIL_HEADERS}) add_style_check_target(paddle_utils ${UTIL_SOURCES} ${UTIL_ARCH_SOURCES}) diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h index b3f439804686fa..7fdfa3240c1de7 100644 --- a/paddle/utils/Logging.h +++ b/paddle/utils/Logging.h @@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int)); } #endif // PADDLE_USE_GLOG -#ifdef NDEBUG +#ifndef NDEBUG #define DEBUG_LEVEL 5 #define DBG VLOG(DEBUG_LEVEL) #else diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 78c3a80674f9c1..90e5093f96ea4e 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex; PyGuard::PyGuard() : guard_(g_pyMutex) {} -static void printPyErrorStack(std::ostream& os, bool withEndl = false) { +static void printPyErrorStack(std::ostream& os, bool withEndl = false, + bool withPyPath = true) { PyObject * ptype, *pvalue, *ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_Clear(); + if (withPyPath) { + os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path"))); + if (withEndl) { + os << std::endl; + } + } PyTracebackObject* obj = (PyTracebackObject*)ptraceback; os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) @@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyGuard guard; - PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str())); - CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName; - PyObjectPtr pyModule(PyImport_Import(pyModuleName.get())); - CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed."; + PyObjectPtr pyModule = py::import(moduleName); PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str())); CHECK_PY(pyFunc) << "GetAttrString failed."; PyObjectPtr pyArgs(PyTuple_New(args.size())); @@ -143,7 +147,7 @@ PyObjectPtr createPythonClass( const std::vector& args, const std::map& kwargs) { PyGuard guard; - PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); + PyObjectPtr pyModule = 
py::import(moduleName); LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); @@ -181,18 +185,29 @@ std::string getPyCallStack() { printPyErrorStack(os, true); return os.str(); } + +PyObjectPtr import(const std::string &moduleName) { + auto module = PyImport_ImportModule(moduleName.c_str()); + CHECK_PY(module) << "Import " << moduleName << "Error"; + return PyObjectPtr(module); +} + } // namespace py #endif - +extern "C" { +extern const char enable_virtualenv_py[]; +} void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON Py_SetProgramName(argv[0]); Py_Initialize(); PySys_SetArgv(argc, argv); - // python blocks SIGINT. Need to enable it. signal(SIGINT, SIG_DFL); + + // Manually activate virtualenv when user is using virtualenv + PyRun_SimpleString(enable_virtualenv_py); #endif } diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index db02d1252b4057..00fc177022ac34 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName, CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() namespace py { +PyObjectPtr import(const std::string& moduleName); + /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h index d73f27d7fafd6c..f952cf58778dee 100644 --- a/paddle/utils/Queue.h +++ b/paddle/utils/Queue.h @@ -135,6 +135,21 @@ class Queue { queueCV_.wait(lock, [this]() { return numElements_ == 0; }); } + /** + * @brief wait queue is not empty at most for some seconds. + * @param seconds wait time limit. + * @return true if queue is not empty. false if timeout. + */ + bool waitNotEmptyFor(int seconds) { + std::unique_lock lock(queueLock_); + return queueCV_.wait_for( + lock, + std::chrono::seconds(seconds), + [this] { + return numElements_ != 0; + }); + } + private: std::deque elements_; int numElements_; diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py new file mode 100644 index 00000000000000..99d822a4145cca --- /dev/null +++ b/paddle/utils/enable_virtualenv.py @@ -0,0 +1,10 @@ +import os + +def __activate_virtual_env__(): + __path__ = os.getenv('VIRTUAL_ENV') + if __path__ is None: + return + __script__ = os.path.join(__path__, 'bin', 'activate_this.py') + execfile(__script__, {'__file__': __script__}) + +__activate_virtual_env__() diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 8bdcd70a417b84..753fd0cac42233 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -170,6 +170,15 @@ message BlockExpandConfig { required uint32 img_size_y = 11; } +message MaxOutConfig { + required uint32 channels = 1; + required uint32 groups = 2; + + // The size of input feature map. + required uint32 img_size_x = 3; + required uint32 img_size_y = 4; +} + message ProjectionConfig { required string type = 1; required string name = 2; @@ -235,6 +244,7 @@ message LayerInputConfig { // Set the argument name. 
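To make the new MaxOutConfig fields concrete, a tiny hypothetical helper mirroring MaxOutLayer::getSize(); the numbers reproduce the test_LayerGrad case above.

    def maxout_output_size(img_size_x, img_size_y, channels, groups):
        # channels must divide evenly by groups, as CHECK_EQ(channels_ % groups_, 0UL) enforces
        assert channels % groups == 0
        feat_len = img_size_x * img_size_y              # featLen_
        return feat_len * (channels // groups)          # featLen_ * outputChannels_

    # 32 x 32 feature maps, 4 channels, groups=2: input width 4096 -> output width 2048
    assert maxout_output_size(32, 32, 4, 2) == 2048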
optional string input_layer_argument = 9; optional BilinearInterpConfig bilinear_interp_conf = 10; + optional MaxOutConfig maxout_conf = 11; } message LayerConfig { diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 34f5dd41b7e683..53409b746d811a 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, calc_batch_size=None, cache=CacheType.NO_CACHE, check=False, check_fail_continue=False, - use_dynamic_order=True, init_hook=None, **kwargs): """ Provider decorator. Use it to make a function into PyDataProvider2 object. @@ -228,9 +227,15 @@ def process(settings, file_name): The configuration of data provider should be setup by\: :param input_types: Specify the input types, can also be set in init_hook. - It is a list of InputType object. For example, input_types= \ - [dense_vector(9), integer_value(2)]. - :type input_types: list|tuple + It could be a list of InputType object. For example, + input_types=[dense_vector(9), integer_value(2)]. Or user + can set a dict of InputType object, which key is + data_layer's name. For example, input_types=\ + {'img': img_features, 'label': label}. when using dict of + InputType, user could yield a dict of feature values, which + key is also data_layer's name. + + :type input_types: list|tuple|dict :param should_shuffle: True if data should shuffle. Pass None means shuffle when is training and not to shuffle when is testing. @@ -281,12 +286,6 @@ def process(settings, file_name): drop the wrong format data when it is True. Has no effect when check set to False. :type check_fail_continue: bool - - :param use_dynamic_order: Allow provider to yield a dictionary object, whose - key is a input data layer name, and value is the - feature value. The tuples are still allowed when - use_dynmaic_order is True. - :type use_dynamic_order: bool """ def __wrapper__(generator): @@ -340,6 +339,11 @@ def __init__(self, file_list, **kwargs): assert self.slots is not None assert self.generator is not None + use_dynamic_order = False + if isinstance(self.slots, dict): # reorder input_types + self.slots = [self.slots[ipt] for ipt in self.input_order] + use_dynamic_order = True + if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 82446e980d81cc..c6cd4f62b91c9a 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -216,6 +216,10 @@ def Inputs(*args): if g_current_submodel is g_root_submodel: g_config.model_config.input_layer_names.append(name) +@config_func +def HasInputsSet(): + return len(g_config.model_config.input_layer_names) != 0 + # Define the name of the output layers of the NeuralNetwork. # Usually the output is simply the cost layer. 
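A minimal sketch of the dict form of input_types described above; the layer names and sizes here are arbitrary, they only need to match the data_layer names in the network config, and the list form keeps working as before.

    from paddle.trainer.PyDataProvider2 import *

    # keys must match the data_layer names used in the network config
    @provider(input_types={'img': dense_vector(9), 'label': integer_value(2)})
    def process(settings, file_name):      # file_name is unused in this toy provider
        for lbl in range(2):
            # yield a dict keyed by layer name; a plain tuple in input order also works
            yield {'img': [float(lbl)] * 9, 'label': lbl}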
@@ -466,6 +470,7 @@ def __init__( pool=None, image=None, block_expand=None, + maxout=None, format=None, nnz=None, is_static=None, @@ -794,6 +799,16 @@ def __init__( output_y = 0): self.add_keys(locals()) +@config_class +class MaxOut(Cfg): + def __init__( + self, + channels, + groups, + img_size_x = 0, + img_size_y = 0): + self.add_keys(locals()) + def DataBase(async_load_data=False, constant_slots=None, data_ratio=1, @@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \ - block_expand.block_y) / float(block_expand.stride_y))) +def parse_maxout(maxout, input_layer_name, maxout_conf): + maxout_conf.channels = maxout.channels + maxout_conf.groups = maxout.groups + maxout_conf.img_size_x = maxout.img_size_x + maxout_conf.img_size_y = maxout.img_size_y + # Define an evaluator @config_func def Evaluator( @@ -1721,6 +1742,21 @@ def __init__( self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y * block_expand_conf.channels) +@config_layer('maxout') +class MaxOutLayer(LayerBase): + def __init__( + self, + name, + inputs, + **xargs): + super(MaxOutLayer, self).__init__(name, 'maxout', 0, inputs=inputs, **xargs) + input_layer = self.get_input_layer(0) + parse_maxout(self.inputs[0].maxout, + input_layer.name, + self.config.inputs[0].maxout_conf) + maxout_conf = self.config.inputs[0].maxout_conf + self.set_layer_size(g_layer_map[input_layer.name].size / maxout_conf.groups) + # key: cost type # value: cost class g_cost_map = {} @@ -1735,7 +1771,6 @@ def init(cls, name, inputs, device=None, coeff=1.): g_cost_map[cost_type] = cls define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy') -define_cost('ClassificationErrorLayer', 'classification_error') define_cost('RankingCost', 'rank-cost') define_cost('AucValidation', 'auc-validation') define_cost('PnpairValidation', 'pnpair-validation') diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 8ada3903dc06be..f51140656d0dcf 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module, file_list_name = 'train.list' if isinstance(cls, TestData): file_list_name = 'test.list' - with open(file_list_name, 'r') as f: + with open(file_list_name, 'w') as f: f.writelines(file_list) file_list = file_list_name @@ -84,6 +84,7 @@ def py_data2(files, load_data_module, load_data_object, load_data_args, data.load_data_module = load_data_module data.load_data_object = load_data_object data.load_data_args = load_data_args + data.async_load_data = True return data data_cls = py_data2 diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 59df4646faae98..8d249b140e8cde 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -50,11 +50,12 @@ 'slope_intercept_layer', 'trans_full_matrix_projection', 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', + 'nce_layer', 'cross_entropy_with_selfnorm', 'cross_entropy', 'multi_binary_label_cross_entropy', 'rank_cost', 'lambda_cost', 'huber_cost', # 'block_expand_layer', # TODO(yuyang18): this layer is not correct - 'out_prod_layer', 'print_layer' + 'maxout_layer', 'out_prod_layer', 'print_layer' ] @@ -110,12 +111,14 @@ class LayerType(object): 
SLOPE_INTERCEPT_LAYER = "slope_intercept" LINEAR_COMBINATION_LAYER = "convex_comb" BLOCK_EXPAND = "blockexpand" + MAXOUT = "maxout" PRINT_LAYER = "print" CTC_LAYER = "ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" + NCE_LAYER = 'nce' RANK_COST = "rank-cost" LAMBDA_COST = "lambda_cost" @@ -169,7 +172,7 @@ class LayerOutput(object): :param activation: Layer Activation. :type activation: BaseActivation. :param parents: Layer's parents. - :type parents: list|tuple|collection.Sequence + :type parents: list|tuple|collections.Sequence """ def __init__(self, name, layer_type, parents=None, activation=None, @@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters, @layer_support() def img_pool_layer(input, pool_size, name=None, num_channels=None, pool_type=None, - stride=1, start=None, padding=0, layer_attr=None, + stride=1, padding=0, layer_attr=None, pool_size_y=None, stride_y=None, padding_y=None, img_width=None): """ @@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None, :type stride: int :param stride_y: stride height of pooling. It is equal to stride by default. :type stride_y: int|None - :param start: start position of pooling operation. Note it is deprecated now. - :type start: int|None :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute :param img_width: the width of input feature map. If it is None, the input feature @@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None, pool_type=type_name, channels=num_channels, size_x=pool_size, - start=start, + start=None, stride=stride, padding=padding, size_y=pool_size_y, @@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None): Concat all input vector into one huge vector. Inputs can be list of LayerOutput or list of projection. + The example usage is: + + .. code-block:: python + + concat = concat_layer(input=[layer1, layer2]) + :param name: Layer name. :type name: basestring :param input: input layers or projections - :type input: list|tuple|collection.Sequence + :type input: list|tuple|collections.Sequence :param act: Activation type. :type act: BaseActivation :param layer_attr: Extra Layer Attribute. @@ -2842,30 +2849,52 @@ def __real_step__(*args): return tmp +def __cost_input__(input, label, weight=None): + """ + inputs and parents for cost layers. + """ + ipts = [Input(input.name), Input(label.name)] + parents = [input, label] + if weight is not None: + assert weight.layer_type == LayerType.DATA + ipts.append(Input(weight.name)) + parents.append(weight) + return ipts, parents + @wrap_name_default() -def regression_cost(input, label, cost='square_error', name=None): +@layer_support() +def regression_cost(input, label, weight=None, name=None, + layer_attr=None): """ Regression Layer. TODO(yuyang18): Complete this method. :param name: layer name. + :type name: basestring :param input: Network prediction. + :type input: LayerOutput :param label: Data label. - :param cost: Cost method. + :type label: LayerOutput + :param weight: The weight affects the cost, namely the scale of cost. + It is an optional argument. + :type weight: LayerOutput + :param layer_attr: layer's extra attribute. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
+ :rtype: LayerOutput """ - Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name) - return LayerOutput( - name, LayerType.COST, parents=[input, label] - ) + ipts, parents = __cost_input__(input, label, weight) + + Layer(inputs=ipts, type="square_error", name=name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name, LayerType.COST, parents=parents) @wrap_name_default("cost") @layer_support() -def classification_cost(input, label, name=None, - cost="multi-class-cross-entropy", +def classification_cost(input, label, weight=None, name=None, evaluator=classification_error_evaluator, layer_attr=None): """ @@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None, :type input: LayerOutput :param label: label layer name. data_layer often. :type label: LayerOutput - :param cost: cost method. - :type cost: basestring + :param weight: The weight affects the cost, namely the scale of cost. + It is an optional argument. + :type weight: LayerOutput :param evaluator: Evaluator method. :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute @@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None, assert input.layer_type != LayerType.DATA assert isinstance(input.activation, SoftmaxActivation) assert label.layer_type == LayerType.DATA - Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)], + + ipts, parents = __cost_input__(input, label, weight) + + Layer(name=name, type="multi-class-cross-entropy", inputs=ipts, **ExtraLayerAttribute.to_kwargs(layer_attr)) def __add_evaluator__(e): @@ -2900,7 +2933,7 @@ def __add_evaluator__(e): assert isinstance(e.for_classification, bool) assert e.for_classification - e(name=e.__name__, input=input, label=label) + e(name=e.__name__, input=input, label=label, weight=weight) if not isinstance(evaluator, collections.Sequence): evaluator = [evaluator] @@ -2908,7 +2941,7 @@ def __add_evaluator__(e): for each_evaluator in evaluator: __add_evaluator__(each_evaluator) - return LayerOutput(name, LayerType.COST, parents=[input, label]) + return LayerOutput(name, LayerType.COST, parents=parents) def conv_operator(img, filter, filter_size, num_filters, @@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters, @wrap_name_default() -def conv_shift_layer(a, b, name=None): +@layer_support() +def conv_shift_layer(a, b, name=None, layer_attr=None): """ This layer performs cyclic convolution for two input. For example: - a[in]: contains M elements. @@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None): :type a: LayerOutput :param b: input layer b :type b: LayerOutput + :param layer_attr: layer's extra attribute. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None): name=name, type=LayerType.CONV_SHIFT_LAYER, inputs=[a.name, b.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], @@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() +@layer_support() def selective_fc_layer(input, select, size, act=None, name=None, pass_generation=False, has_selected_colums=True, @@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None, @wrap_name_default() -def sampling_id_layer(input, name=None): +@layer_support() +def sampling_id_layer(input, name=None, layer_attr=None): """ A layer for sampling id from multinomial distribution from the input layer. Sampling one id for one sample. @@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None): :type input: LayerOutput :param name: The Layer Name. :type name: basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None): name=name, type=LayerType.SAMPLING_ID_LAYER, inputs=[Input(input.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input) @wrap_name_default() -def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): +@layer_support() +def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0, + layer_attr=None): """ This layer for applying a slope and an intercept to the input element-wise. There is no activation and weight. @@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): :type slope: float. :param intercept: the offset. :type intercept: float. + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): slope=slope, intercept=intercept, inputs=[Input(input.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input) @wrap_name_default() -def linear_comb_layer(weights, vectors, size=None, name=None): +@layer_support() +def linear_comb_layer(weights, vectors, size=None, name=None, + layer_attr=None): """ A layer for weighted sum of vectors takes two inputs. - Input: size of weights is M @@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None): :type size: int :param name: The Layer Name. :type name: basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None): type=LayerType.LINEAR_COMBINATION_LAYER, size=size, inputs=[Input(weights.name), Input(vectors.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size) @@ -3295,6 +3347,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None): @wrap_name_default() +@layer_support() def block_expand_layer(input, channel=0, block_x=0, @@ -3303,7 +3356,8 @@ def block_expand_layer(input, stride_y=0, padding_x=0, padding_y=0, - name=None): + name=None, + layer_attr=None): """ Expand feature map to minibatch matrix. - matrix width is: block_y * block_x * channel @@ -3350,6 +3404,8 @@ def block_expand_layer(input, :type padding_y: int :param name: The name of this layer, which can not specify. :type name: None|basestring. + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3364,13 +3420,83 @@ def block_expand_layer(input, padding_y=padding_y) ), type=LayerType.BLOCK_EXPAND, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input]) @wrap_name_default() -def ctc_layer(input, label, size=None, name=None, norm_by_times=False): +@layer_support() +def maxout_layer(input, + groups, + num_channels=None, + size_x=None, + size_y=None, + name=None, + layer_attr=None): + """ + A layer to do max out on conv layer output. + - Input: output of a conv layer. + - Output: feature map size same as input. Channel is (input channel) / groups. + + So groups should be larger than 1, and the num of channels should be able + to devided by groups. + + Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + + The simple usage is: + + .. code-block:: python + + maxout = maxout_layer(input, + num_channels=128, + groups=4) + + :param input: The input layer. + :type input: LayerOutput + :param num_channels: The channel number of input layer. If None will be set + automatically from previous output. + :type num_channels: int|None + :param groups: The group number of input layer. + :type groups: int + :param size_x: conv output width. If None will be set + automatically from previous output. + :type size_x: int|None + :param size_y: conv output height. If None will be set + automatically from previous output. + :type size_y: int|None + :param name: The name of this layer, which can not specify. + :type name: None|basestring. + :param layer_attr: Extra Layer attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + assert input.layer_type == LayerType.CONV_LAYER + assert isinstance(input.activation, LinearActivation) + assert groups > 1 + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + assert num_channels % groups == 0 + Layer(name=name, + inputs=Input(input.name, + maxout=MaxOut(channels=num_channels, + groups=groups)), + type=LayerType.MAXOUT, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name, LayerType.MAXOUT, parents=[input]) + + +@wrap_name_default() +@layer_support() +def ctc_layer(input, label, size=None, name=None, norm_by_times=False, + layer_attr=None): """ Connectionist Temporal Classification (CTC) is designed for temporal classication task. That is, for sequence labeling problems where the @@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): :type name: basestring|None :param norm_by_times: Whether to normalization by times. False by default. :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): type=LayerType.CTC_LAYER, size=size, norm_by_times=norm_by_times, - inputs=[input.name, label.name] + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) @wrap_name_default() @wrap_param_attr_default() -def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): +@layer_support() +def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None, + layer_attr=None): """ A layer for calculating the cost of sequential conditional random field model. @@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): type=LayerType.CRF_LAYER, size=size, inputs=ipts, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) parents = [input, label] if weight is not None: @@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): @wrap_name_default() @wrap_param_attr_default() -def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): +@layer_support() +def crf_decoding_layer(input, size, label=None, param_attr=None, name=None, + layer_attr=None): """ A layer for calculating the decoding sequence of sequential conditional random field model. The decoding sequence is stored in output.ids. @@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): type=LayerType.CRF_DECODING_LAYER, size=size, inputs=ipts, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) parents = [input] if label is not None: parents.append(label) return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size) +@wrap_bias_attr_default(has_bias=True) +@wrap_name_default() +@layer_support() +def nce_layer(input, label, num_classes, weight=None, + num_neg_samples=10, neg_distribution=None, + name=None, bias_attr=None, layer_attr=None): + """ + Noise-contrastive estimation. + Implements the method in the following paper: + A fast and simple algorithm for training neural probabilistic language models. + + The example usage is: + + .. code-block:: python + + cost = nce_layer(input=layer1, label=layer2, weight=layer3, + num_classes=3, neg_distribution=[0.1,0.3,0.6]) + + :param name: layer name + :type name: basestring + :param input: input layers. It could be a LayerOutput of list/tuple of LayerOutput. + :type input: LayerOutput|list|tuple|collections.Sequence + :param label: label layer + :type label: LayerOutput + :param weight: weight layer, can be None(default) + :type weight: LayerOutput + :param num_classes: number of classes. + :type num_classes: int + :param num_neg_samples: number of negative samples. Default is 10. + :type num_neg_samples: int + :param neg_distribution: The distribution for generating the random negative labels. + A uniform distribution will be used if not provided. + If not None, its length must be equal to num_classes. + :type neg_distribution: list|tuple|collections.Sequence|None + :param bias_attr: Bias parameter attribute. True if no bias. + :type bias_attr: ParameterAttribute|None|False + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: layer name. + :rtype: LayerOutput + """ + if isinstance(input, LayerOutput): + input = [input] + assert isinstance(input, collections.Sequence) + assert isinstance(label, LayerOutput) + assert label.layer_type == LayerType.DATA + if neg_distribution is not None: + assert isinstance(neg_distribution, collections.Sequence) + assert len(neg_distribution) == num_classes + assert sum(neg_distribution) == 1 + + ipts_for_layer = [] + parents = [] + for each_input in input: + assert isinstance(each_input, LayerOutput) + ipts_for_layer.append(each_input.name) + parents.append(each_input) + ipts_for_layer.append(label.name) + parents.append(label) + + if weight is not None: + assert isinstance(weight, LayerOutput) + assert weight.layer_type == LayerType.DATA + ipts_for_layer.append(weight.name) + parents.append(weight) + + Layer( + name=name, + type=LayerType.NCE_LAYER, + num_classes=num_classes, + neg_sampling_dist=neg_distribution, + num_neg_samples=num_neg_samples, + inputs=ipts_for_layer, + bias=ParamAttr.to_bias(bias_attr), + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) + return LayerOutput(name, LayerType.NCE_LAYER, parents=parents) """ following are cost Layers. @@ -3534,7 +3750,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): @wrap_name_default() -def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): +@layer_support() +def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr=None): """ A cost Layer for learning to rank using gradient descent. 
Details can refer to `papers 0 + + if HasInputsSet(): # input already set + Outputs(*[l.name for l in layers]) + return # just return outputs. + if len(layers) != 1: logger.warning("`outputs` routine try to calculate network's" " inputs and outputs order. It might not work well." diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index 4660a6b5003daf..d4b947517b7d04 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -362,6 +362,13 @@ def __extends__(dict1, dict2): default_factory=lambda _: BaseRegularization()) def settings(batch_size, learning_rate=1e-3, + learning_rate_decay_a=0., + learning_rate_decay_b=0., + learning_rate_schedule='poly', + learning_rate_args='', + average_window=0, + do_average_in_cpu=False, + max_average_window=None, learning_method=None, regularization=None, is_async=False, @@ -408,10 +415,14 @@ def settings(batch_size, else: algorithm = 'owlqn' + args=['batch_size', 'learning_rate', 'learning_rate_decay_a', + 'learning_rate_decay_b', 'learning_rate_schedule', + 'learning_rate_args', 'average_window', 'do_average_in_cpu', + 'max_average_window'] kwargs = dict() - kwargs['batch_size'] = batch_size - kwargs['learning_rate'] = learning_rate kwargs['algorithm'] = algorithm + for arg in args: + kwargs[arg] = locals()[arg] kwargs = __extends__(kwargs, learning_method.to_setting_kwargs()) learning_method.extra_settings() diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 index 359652f3d09c7f..88ce5c129e552e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/check.md5 +++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5 @@ -2,13 +2,17 @@ a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr 9c038249ec8ff719753a746cdb04c026 layer_activations.protostr 5913f87b39cee3b2701fa158270aca26 projections.protostr +7334ba0a4544f0623231330fc51d390d shared_fc.protostr +8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr 6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr 0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr +6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr 144bc6d3a509de74115fa623741797ed test_expand_layer.protostr 2378518bdb71e8c6e888b1842923df58 test_fc.protostr 8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr 1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr +6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr 251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr 2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh index e8be0023e70134..15c66a9754604c 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers util_layers simple_rnn_layers unused_layers test_cost_layers -test_rnn_group test_bilinear_interp) - +test_rnn_group shared_fc shared_lstm 
test_cost_layers_with_weight +test_bilinear_interp test_maxout) for conf in ${configs[*]} do diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py new file mode 100644 index 00000000000000..202cf367fc7f28 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py @@ -0,0 +1,22 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +a = data_layer(name='feature_a', size=200) +b = data_layer(name='feature_b', size=200) + +fc_param = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0) +bias_param = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0) + +softmax_param = ParamAttr(name='softmax_param', initial_max=1.0, initial_min=-1.0) + +hidden_a = fc_layer(input=a, size=200, param_attr=fc_param, bias_attr=bias_param) +hidden_b = fc_layer(input=b, size=200, param_attr=fc_param, bias_attr=bias_param) + +predict = fc_layer(input=[hidden_a, hidden_b], param_attr=[softmax_param, softmax_param], + bias_attr=False, size=10, act=SoftmaxActivation()) + +outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py new file mode 100644 index 00000000000000..8557e9daaf66ad --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py @@ -0,0 +1,29 @@ +from paddle.trainer_config_helpers import * + +settings(learning_rate=1e-4, batch_size=1000) + +data_1 = data_layer(name='data_a', size=100) +data_2 = data_layer(name='data_b', size=100) + +mixed_param = ParamAttr(name='mixed_param') + +with mixed_layer(size=400, bias_attr=False) as m1: + m1 += full_matrix_projection(input=data_1, param_attr=mixed_param) + +with mixed_layer(size=400, bias_attr=False) as m2: + m2 += full_matrix_projection(input=data_2, param_attr=mixed_param) + +lstm_param = ParamAttr(name='lstm_param') +lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.) 
+ +lstm1 = lstmemory_group(input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False) +lstm2 = lstmemory_group(input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False) + +softmax_param = ParamAttr(name='softmax_param') + +predict = fc_layer(input=[last_seq(input=lstm1), last_seq(input=lstm2)], + size=10, + param_attr=[softmax_param, softmax_param], + bias_attr=False, + act=SoftmaxActivation()) +outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py new file mode 100644 index 00000000000000..29749cbb666379 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py @@ -0,0 +1,14 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +data = data_layer(name='input', size=300) +lbl = data_layer(name='label', size=1) +wt = data_layer(name='weight', size=1) +fc = fc_layer(input=data, size=10, act=SoftmaxActivation()) + +outputs(classification_cost(input=fc, label=lbl, weight=wt), + regression_cost(input=fc, label=lbl, weight=wt)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py new file mode 100644 index 00000000000000..079e2cf4c43206 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -0,0 +1,30 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +data = data_layer(name='data', size=2304) + +conv = img_conv_layer(input=data, + filter_size = 3, + num_channels=1, + num_filters=16, + padding=1, + act=LinearActivation(), + bias_attr=True) + +maxout = maxout_layer(input=conv, + num_channels=16, + groups=2) + +pool = img_pool_layer(input=maxout, + num_channels=8, + pool_size=2, + stride=2, + pool_type=MaxPooling()) + +fc = fc_layer(input=pool, size=384, bias_attr=False) + +outputs(fc)
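The maxout_layer introduced in this patch keeps the spatial size of the conv output and divides the channel count by ``groups`` (test_maxout.py above: 16 conv filters, groups=2, 8 channels into the pool layer). A minimal NumPy sketch of that reduction, assuming consecutive channels form one group; the exact grouping convention is not spelled out in the patch, so treat this as an illustration only, not the Paddle kernel:

.. code-block:: python

    import numpy as np

    def maxout_forward(feature_map, groups):
        # feature_map: (channels, height, width); channels must divide evenly.
        channels, height, width = feature_map.shape
        assert groups > 1 and channels % groups == 0
        grouped = feature_map.reshape(channels // groups, groups, height, width)
        # One output channel per group: element-wise max across the group.
        return grouped.max(axis=1)

    x = np.random.randn(16, 48, 48).astype(np.float32)
    y = maxout_forward(x, groups=2)
    assert y.shape == (8, 48, 48)   # channels / groups, spatial size unchanged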
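classification_cost and regression_cost now accept an optional ``weight`` data layer, exercised by test_cost_layers_with_weight.py above. A hedged NumPy sketch of the intended effect, each sample's loss scaled by its weight before reduction; the exact reduction Paddle applies is not shown in this patch, so the mean below is an assumption for illustration:

.. code-block:: python

    import numpy as np

    def weighted_cross_entropy(probs, labels, weights):
        # probs: (N, num_classes) softmax output; labels, weights: (N,)
        per_sample = -np.log(probs[np.arange(len(labels)), labels])
        return np.mean(weights * per_sample)

    def weighted_square_error(pred, target, weights):
        # pred, target: (N, dim); weights: (N,)
        per_sample = np.sum((pred - target) ** 2, axis=1)
        return np.mean(weights * per_sample)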
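For the new nce_layer, ``neg_distribution`` must have ``num_classes`` entries and sum to one when given; otherwise negative labels are drawn uniformly. A small sketch of that sampling contract, using the distribution from the docstring example; Paddle's actual sampler lives in C++ and is not part of this diff:

.. code-block:: python

    import numpy as np

    def sample_negatives(num_classes, num_neg_samples=10, neg_distribution=None):
        if neg_distribution is not None:
            assert len(neg_distribution) == num_classes
            assert abs(sum(neg_distribution) - 1.0) < 1e-6
        # p=None falls back to a uniform distribution over num_classes.
        return np.random.choice(num_classes, size=num_neg_samples,
                                p=neg_distribution)

    print(sample_negatives(3, num_neg_samples=5, neg_distribution=[0.1, 0.3, 0.6]))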
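settings() now forwards the learning-rate schedule and parameter-averaging options directly into the trainer configuration. A usage sketch with the newly exposed arguments; all numeric values below are placeholders chosen for illustration, not recommendations:

.. code-block:: python

    from paddle.trainer_config_helpers import *

    settings(
        batch_size=128,
        learning_rate=1e-3,
        learning_rate_schedule='poly',
        learning_rate_decay_a=0.1,     # placeholder decay constants
        learning_rate_decay_b=0.75,
        average_window=0.5,            # placeholder averaging options
        max_average_window=10000,
        do_average_in_cpu=False,
    )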