Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Browse files Browse the repository at this point in the history
… fit_a_line
  • Loading branch information
qingqing01 committed Mar 2, 2017
2 parents 80c9f66 + 061e743 commit 62ff19e
Show file tree
Hide file tree
Showing 17 changed files with 775 additions and 19 deletions.
1 change: 1 addition & 0 deletions python/paddle/v2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import data_type
import topology
import data_feeder
import networks
from . import dataset
from . import reader
import attr
Expand Down
33 changes: 28 additions & 5 deletions python/paddle/v2/config_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class Layer(object):
def __init__(self, name=None, parent_layers=None):
assert isinstance(parent_layers, dict)
self.name = name
self.__contex__ = {}
self.__parent_layers__ = parent_layers

def to_proto(self, context):
Expand All @@ -39,16 +40,38 @@ def to_proto(self, context):
self.__parent_layers__[layer_name])
kwargs[layer_name] = v1_layer

if self.name is None:
if self.context_name() is None:
return self.to_proto_impl(**kwargs)
elif self.name not in context:
context[self.name] = self.to_proto_impl(**kwargs)

return context[self.name]
elif self.context_name() not in context:
context[self.context_name()] = self.to_proto_impl(**kwargs)
self.__contex__ = context
if self.use_context_name():
return context[self.context_name()]
else:
return context[self.name]

def to_proto_impl(self, **kwargs):
raise NotImplementedError()

def context_name(self):
"""
Context name means the context which stores `to_proto_impl` result.
If multiple layer share same context_name, the `to_proto_impl` of them
will be invoked only once.
"""
return self.name

def use_context_name(self):
return False

def calculate_size(self):
"""
lazy calculate size of the layer, should be called when to_proto_impl of
this layer is called.
:return:
"""
return self.__contex__[self.context_name()].size


def __convert_to_v2__(method_name, parent_names, is_default_name=True):
if is_default_name:
Expand Down
14 changes: 14 additions & 0 deletions python/paddle/v2/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import mnist
import imikolov
import imdb
Expand Down
14 changes: 14 additions & 0 deletions python/paddle/v2/dataset/cifar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
"""

import cPickle
import itertools
import numpy
Expand Down
14 changes: 14 additions & 0 deletions python/paddle/v2/dataset/common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import hashlib
import os
Expand Down
205 changes: 205 additions & 0 deletions python/paddle/v2/dataset/conll05.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.v2.dataset.common
import tarfile
import gzip
import itertools

__all__ = ['test, get_dict', 'get_embedding']
"""
Conll 2005 dataset. Paddle semantic role labeling Book and demo use this
dataset as an example. Because Conll 2005 is not free in public, the default
downloaded URL is test set of Conll 2005 (which is public). Users can change
URL and MD5 to their Conll dataset.
"""

DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'

UNK_IDX = 0


def load_dict(filename):
d = dict()
with open(filename, 'r') as f:
for i, line in enumerate(f):
d[line.strip()] = i
return d


def corpus_reader(data_path, words_name, props_name):
"""
Read one corpus. It returns an iterator. Each element of
this iterator is a tuple including sentence and labels. The sentence is
consist of a list of word IDs. The labels include a list of label IDs.
:return: a iterator of data.
:rtype: iterator
"""

def reader():
tf = tarfile.open(data_path)
wf = tf.extractfile(words_name)
pf = tf.extractfile(props_name)
with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
fileobj=pf) as props_file:
sentences = []
labels = []
one_seg = []
for word, label in itertools.izip(words_file, props_file):
word = word.strip()
label = label.strip().split()

if len(label) == 0: # end of sentence
for i in xrange(len(one_seg[0])):
a_kind_lable = [x[i] for x in one_seg]
labels.append(a_kind_lable)

if len(labels) >= 1:
verb_list = []
for x in labels[0]:
if x != '-':
verb_list.append(x)

for i, lbl in enumerate(labels[1:]):
cur_tag = 'O'
is_in_bracket = False
lbl_seq = []
verb_word = ''
for l in lbl:
if l == '*' and is_in_bracket == False:
lbl_seq.append('O')
elif l == '*' and is_in_bracket == True:
lbl_seq.append('I-' + cur_tag)
elif l == '*)':
lbl_seq.append('I-' + cur_tag)
is_in_bracket = False
elif l.find('(') != -1 and l.find(')') != -1:
cur_tag = l[1:l.find('*')]
lbl_seq.append('B-' + cur_tag)
is_in_bracket = False
elif l.find('(') != -1 and l.find(')') == -1:
cur_tag = l[1:l.find('*')]
lbl_seq.append('B-' + cur_tag)
is_in_bracket = True
else:
raise RuntimeError('Unexpected label: %s' %
l)

yield sentences, verb_list[i], lbl_seq

sentences = []
labels = []
one_seg = []
else:
sentences.append(word)
one_seg.append(label)

pf.close()
wf.close()
tf.close()

return reader


def reader_creator(corpus_reader,
word_dict=None,
predicate_dict=None,
label_dict=None):
def reader():
for sentence, predicate, labels in corpus_reader():

sen_len = len(sentence)

verb_index = labels.index('B-V')
mark = [0] * len(labels)
if verb_index > 0:
mark[verb_index - 1] = 1
ctx_n1 = sentence[verb_index - 1]
else:
ctx_n1 = 'bos'

if verb_index > 1:
mark[verb_index - 2] = 1
ctx_n2 = sentence[verb_index - 2]
else:
ctx_n2 = 'bos'

mark[verb_index] = 1
ctx_0 = sentence[verb_index]

if verb_index < len(labels) - 1:
mark[verb_index + 1] = 1
ctx_p1 = sentence[verb_index + 1]
else:
ctx_p1 = 'eos'

if verb_index < len(labels) - 2:
mark[verb_index + 2] = 1
ctx_p2 = sentence[verb_index + 2]
else:
ctx_p2 = 'eos'

word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
pred_idx = [predicate_dict.get(predicate)] * sen_len

ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len

label_idx = [label_dict.get(w) for w in labels]

yield word_idx, pred_idx, ctx_n2_idx, ctx_n1_idx, \
ctx_0_idx, ctx_p1_idx, ctx_p2_idx, mark, label_idx

return reader()


def get_dict():
word_dict = load_dict(
common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
verb_dict = load_dict(
common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
label_dict = load_dict(
common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
return word_dict, verb_dict, label_dict


def get_embedding():
return common.download(EMB_URL, 'conll05st', EMB_MD5)


def test():
word_dict, verb_dict, label_dict = get_dict()
reader = corpus_reader(
common.download(DATA_URL, 'conll05st', DATA_MD5),
words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
return reader_creator(reader, word_dict, verb_dict, label_dict)


if __name__ == '__main__':
print get_embedding()
for f in test():
print f
4 changes: 1 addition & 3 deletions python/paddle/v2/dataset/imdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# /usr/bin/env python
# -*- coding:utf-8 -*-

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -17,6 +14,7 @@
"""
IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
"""

import paddle.v2.dataset.common
import tarfile
import Queue
Expand Down
13 changes: 13 additions & 0 deletions python/paddle/v2/dataset/imikolov.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
"""
Expand Down
13 changes: 13 additions & 0 deletions python/paddle/v2/dataset/mnist.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MNIST dataset.
"""
Expand Down
14 changes: 14 additions & 0 deletions python/paddle/v2/dataset/movielens.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import zipfile
from common import download
import re
Expand Down
Loading

0 comments on commit 62ff19e

Please sign in to comment.