Further style fixes.
fchollet committed Jan 12, 2017
1 parent 538d368 commit cfa1f7c
Showing 4 changed files with 30 additions and 29 deletions.
47 changes: 23 additions & 24 deletions keras/preprocessing/text.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
-"""These preprocessing utilities would greatly benefit
-from a fast Cython rewrite.
+"""Utilities for text input preprocessing.
+
+May benefit from a fast Cython rewrite.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -17,14 +18,9 @@
 maketrans = str.maketrans


-def base_filter():
-    f = string.punctuation
-    f = f.replace("'", '')
-    f += '\t\n'
-    return f
-
-
-def text_to_word_sequence(text, filters=base_filter(), lower=True, split=" "):
+def text_to_word_sequence(text,
+                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+                          lower=True, split=" "):
     """Converts a text to a sequence of word indices.

     # Arguments
@@ -40,7 +36,7 @@ def text_to_word_sequence(text, filters=base_filter(), lower=True, split=" "):
         text = text.lower()
     text = text.translate(maketrans(filters, split * len(filters)))
     seq = text.split(split)
-    return [_f for _f in seq if _f]
+    return [i for i in seq if i]


 def one_hot(text, n, filters=base_filter(), lower=True, split=" "):
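
[Editor's note on the hunk above: the inlined `filters` default is character-for-character what `base_filter()` used to return (ASCII punctuation minus the apostrophe, plus tab and newline). A minimal sketch verifying the equivalence; the sample sentence is invented for illustration:]

    import string

    from keras.preprocessing.text import text_to_word_sequence

    old_default = string.punctuation.replace("'", '') + '\t\n'
    new_default = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    assert old_default == new_default  # the refactor keeps behavior identical

    print(text_to_word_sequence("Hello, world! It's a test."))
    # ['hello', 'world', "it's", 'a', 'test']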
@@ -78,8 +74,11 @@ class Tokenizer(object):
         `0` is a reserved index that won't be assigned to any word.
     """

-    def __init__(self, nb_words=None, filters=base_filter(),
-                 lower=True, split=' ', char_level=False):
+    def __init__(self, nb_words=None,
+                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
+                 lower=True,
+                 split=' ',
+                 char_level=False):
         self.word_counts = {}
         self.word_docs = {}
         self.filters = filters
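
[For context on the Tokenizer hunk, a hypothetical usage sketch; the corpus and `nb_words` value are invented, and exact indices depend on frequency ordering:]

    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(nb_words=1000)  # keep only the 1000 most frequent words
    tokenizer.fit_on_texts(['the cat sat', 'the dog sat'])
    print(tokenizer.texts_to_sequences(['the cat']))
    # e.g. [[1, 3]] -- 'the' is most frequent, so it gets the lowest index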
@@ -165,6 +164,7 @@ def texts_to_sequences(self, texts):

     def texts_to_sequences_generator(self, texts):
         """Transforms each text in texts in a sequence of integers.
+
         Only top "nb_words" most frequent words will be taken into account.
         Only words known by the tokenizer will be taken into account.
@@ -191,8 +191,7 @@ def texts_to_sequences_generator(self, texts):
             yield vect

     def texts_to_matrix(self, texts, mode='binary'):
-        """Convert a list of texts to a Numpy matrix,
-        according to some vectorization mode.
+        """Convert a list of texts to a Numpy matrix.

         # Arguments
             texts: list of strings.
@@ -205,8 +204,7 @@ def texts_to_matrix(self, texts, mode='binary'):
         return self.sequences_to_matrix(sequences, mode=mode)

     def sequences_to_matrix(self, sequences, mode='binary'):
-        """Converts a list of sequences into a Numpy matrix,
-        according to some vectorization mode.
+        """Converts a list of sequences into a Numpy matrix.

         # Arguments
             sequences: list of sequences
@@ -229,7 +227,7 @@ def sequences_to_matrix(self, sequences, mode='binary'):
             raise ValueError('Fit the Tokenizer on some data '
                              'before using tfidf mode.')

-        X = np.zeros((len(sequences), nb_words))
+        x = np.zeros((len(sequences), nb_words))
         for i, seq in enumerate(sequences):
             if not seq:
                 continue
@@ -243,17 +241,18 @@ def sequences_to_matrix(self, sequences, mode='binary'):
                     counts[j] += 1
             for j, c in list(counts.items()):
                 if mode == 'count':
-                    X[i][j] = c
+                    x[i][j] = c
                 elif mode == 'freq':
-                    X[i][j] = c / len(seq)
+                    x[i][j] = c / len(seq)
                 elif mode == 'binary':
-                    X[i][j] = 1
+                    x[i][j] = 1
                 elif mode == 'tfidf':
                     # Use weighting scheme 2 in
                     # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                     tf = 1 + np.log(c)
-                    idf = np.log(1 + self.document_count / (1 + self.index_docs.get(j, 0)))
-                    X[i][j] = tf * idf
+                    idf = np.log(1 + self.document_count /
+                                 (1 + self.index_docs.get(j, 0)))
+                    x[i][j] = tf * idf
                 else:
                     raise ValueError('Unknown vectorization mode:', mode)
-        return X
+        return x
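
[The tfidf branch above implements weighting scheme 2 from the linked Wikipedia page. A standalone sketch of the arithmetic, with invented counts; `document_count` and `doc_freq` stand in for the tokenizer's internal state:]

    import numpy as np

    c = 3                 # occurrences of word j in this sequence
    document_count = 100  # total documents the tokenizer was fit on
    doc_freq = 9          # documents containing word j (index_docs.get(j, 0))

    tf = 1 + np.log(c)                                 # log-normalized term frequency
    idf = np.log(1 + document_count / (1 + doc_freq))  # smoothed inverse document frequency
    print(tf * idf)  # ~5.03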
2 changes: 1 addition & 1 deletion keras/utils/io_utils.py
@@ -17,7 +17,7 @@
 tables = None


-class HDF5Matrix():
+class HDF5Matrix(object):
     """Representation of HDF5 dataset to be used instead of a Numpy array.

     # Example
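[A gloss on the one-line io_utils.py change, not from the diff itself: in Python 2, a class that does not inherit from `object` is old-style, so adding the base makes HDF5Matrix new-style (working `super()`, descriptors, properties); in Python 3 the two spellings are equivalent. A minimal sketch of the distinction:]

    class OldStyle:          # old-style under Python 2
        pass

    class NewStyle(object):  # new-style everywhere; the pattern HDF5Matrix adopts
        pass

    # Python 2: type(OldStyle()) is <type 'instance'>, type(NewStyle()) is NewStyle.
    # Python 3: both report the class itself.
    print(type(OldStyle()), type(NewStyle()))
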
6 changes: 4 additions & 2 deletions keras/utils/layer_utils.py
@@ -44,15 +44,17 @@ def layer_from_config(config, custom_objects=None):


 def print_summary(layers, relevant_nodes=None,
-                  line_length=100, positions=[.33, .55, .67, 1.]):
+                  line_length=100, positions=None):
     """Prints a summary of a layer.

     # Arguments
         layers: list of layers to print summaries of
         relevant_nodes: list of relevant nodes
         line_length: total length of printed lines
-        positions: relative or absolute positions of log elements in each line
+        positions: relative or absolute positions of log elements in each line.
+            If not provided, defaults to `[.33, .55, .67, 1.]`.
     """
+    positions = positions or [.33, .55, .67, 1.]
     if positions[-1] <= 1:
         positions = [int(line_length * p) for p in positions]
     # header names for the different log elements
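[The `positions=None` change above is the standard fix for Python's mutable-default-argument pitfall: a list literal in a `def` is evaluated once, so all calls share one object. `print_summary` only rebinds `positions`, but the `None` sentinel removes the hazard entirely. A small demonstration with toy functions, not from Keras:]

    def append_bad(item, bucket=[]):     # one list shared across every call
        bucket.append(item)
        return bucket

    def append_good(item, bucket=None):  # fresh list per call, as print_summary now does
        bucket = bucket or []
        bucket.append(item)
        return bucket

    print(append_bad(1), append_bad(2))    # [1, 2] [1, 2] -- state leaks between calls
    print(append_good(1), append_good(2))  # [1] [2]
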
4 changes: 2 additions & 2 deletions keras/utils/np_utils.py
@@ -71,7 +71,7 @@ def convert_kernel(kernel, dim_ordering=None):
     Also works reciprocally, since the transformation is its own inverse.

     # Arguments
-        kerne: Numpy array (4D or 5D).
+        kernel: Numpy array (4D or 5D).
         dim_ordering: the data format.

     # Returns
@@ -85,7 +85,7 @@ def convert_kernel(kernel, dim_ordering=None):
     if not 4 <= kernel.ndim <= 5:
         raise ValueError('Invalid kernel shape:', kernel.shape)

-    slices = [slice(None, None, -1) for i in range(kernel.ndim)]
+    slices = [slice(None, None, -1) for _ in range(kernel.ndim)]
     no_flip = (slice(None, None), slice(None, None))
     if dim_ordering == 'th':  # (out_depth, input_depth, ...)
         slices[:2] = no_flip
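[For intuition on the np_utils.py hunks: `slice(None, None, -1)` reverses an axis, and `convert_kernel` flips every axis except the two that index feature maps, i.e. a 180-degree spatial rotation. An illustrative sketch with an invented 4D TensorFlow-format kernel (rows, cols, in_depth, out_depth):]

    import numpy as np

    kernel = np.arange(9).reshape((3, 3, 1, 1))

    slices = [slice(None, None, -1) for _ in range(kernel.ndim)]
    slices[-2:] = (slice(None, None), slice(None, None))  # leave the depth axes alone
    flipped = kernel[tuple(slices)]

    print(kernel[:, :, 0, 0])   # [[0 1 2] [3 4 5] [6 7 8]]
    print(flipped[:, :, 0, 0])  # [[8 7 6] [5 4 3] [2 1 0]] -- rotated 180 degrees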
