Skip to content

Commit

Permalink
Fix documentation for gensim.corpora. Partial fix piskvorky#1671 (p…
Browse files Browse the repository at this point in the history
…iskvorky#1729)

* Fix typo

* Make `save_corpus` private

* Annotate `bleicorpus.py`

* Make __save_corpus weakly private

* Fix _save_corpus in tests

* Fix _save_corpus[2]

* Document bleicorpus in Numpy style

* Document indexedcorpus

* Annotate csvcorpus

* Add "Yields" section

* Make `_save_corpus` public

* Annotate bleicorpus

* Fix indentation in bleicorpus

* `_save_corpus` -> `save_corpus`

* Annotate bleicorpus

* Convert dictionary docs to numpy style

* Convert hashdictionary docs to numpy style

* Convert indexedcorpus docs to numpy style

* Convert lowcorpus docs to numpy style

* Convert malletcorpus docs to numpy style

* Convert mmcorpus docs to numpy style

* Convert sharded_corpus docs to numpy style

* Convert svmlightcorpus docs to numpy style

* Convert textcorpus docs to numpy style

* Convert ucicorpus docs to numpy style

* Convert wikicorpus docs to numpy style

* Add sphinx tweaks

* Remove trailing whitespaces

* Annotate wikicorpus

* SVMLight Corpus annotated

* Fix TODO

* Fix grammar mistake

* Undo changes to dictionary

* Undo changes to hashdictionary

* Document indexedcorpus

* Document indexedcorpus[2]

Fix indentation

* Remove redundant files

* Add more dots. :)

* Fix monospace

* remove useless method

* fix bleicorpus

* fix csvcorpus

* fix indexedcorpus

* fix svmlightcorpus

* fix wikicorpus[1]

* fix wikicorpus[2]

* fix wikicorpus[3]

* fix review comments
  • Loading branch information
anotherbugmaster authored and menshikh-iv committed Jan 22, 2018
1 parent 74dae4d commit c5f487d
Show file tree
Hide file tree
Showing 5 changed files with 553 additions and 202 deletions.
101 changes: 78 additions & 23 deletions gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""
"""Corpus in Blei's LDA-C format."""

from __future__ import with_statement

Expand All @@ -19,30 +17,44 @@
from six.moves import xrange


logger = logging.getLogger('gensim.corpora.bleicorpus')
logger = logging.getLogger(__name__)


class BleiCorpus(IndexedCorpus):
"""
Corpus in Blei's LDA-C format.
"""Corpus in Blei's LDA-C format.
The corpus is represented as two files: one describing the documents, and another
describing the mapping between words and their ids.
Each document is one line::
N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`.
The vocabulary is a file with words, one word per line; word at line K has an
implicit ``id=K``.
"""

def __init__(self, fname, fname_vocab=None):
"""
Initialize the corpus from a file.
`fname_vocab` is the file with vocabulary; if not specified, it defaults to
`fname.vocab`.
Parameters
----------
fname : str
Path to corpus.
fname_vocab : str, optional
Vocabulary file. If `fname_vocab` is None, one of the following variants is searched for:
* `fname`.vocab
* `fname`/vocab.txt
* `fname_without_ext`.vocab
* `fname_folder`/vocab.txt
Raises
------
IOError
If vocabulary file doesn't exist.
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
Expand All @@ -67,8 +79,13 @@ def __init__(self, fname, fname_vocab=None):
self.id2word = dict(enumerate(words))

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""Iterate over the corpus, returning one sparse (BoW) vector at a time.
Yields
------
list of (int, float)
Document's BoW representation.
"""
lineno = -1
with utils.smart_open(self.fname) as fin:
Expand All @@ -77,6 +94,19 @@ def __iter__(self):
self.length = lineno + 1

def line2doc(self, line):
"""Convert line in Blei LDA-C format to document (BoW representation).
Parameters
----------
line : str
Line in Blei's LDA-C format.
Returns
-------
list of (int, float)
Document's BoW representation.
"""
parts = utils.to_unicode(line).split()
if int(parts[0]) != len(parts) - 1:
raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
Expand All @@ -86,14 +116,28 @@ def line2doc(self, line):

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the LDA-C format.
There are actually two files saved: `fname` and `fname.vocab`, where
`fname.vocab` is the vocabulary file.
"""Save a corpus in the LDA-C format.
Notes
-----
There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.
Parameters
----------
fname : str
Path to output file.
corpus : iterable of iterable of (int, float)
Input corpus in BoW format.
id2word : dict of (str, str), optional
Mapping id -> word for `corpus`.
metadata : bool, optional
THIS PARAMETER WILL BE IGNORED.
Returns
-------
list of int
Offsets for each line in file (in bytes).
This function is automatically called by `BleiCorpus.serialize`; don't
call it directly, call `serialize` instead.
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
Expand Down Expand Up @@ -121,8 +165,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
return offsets

def docbyoffset(self, offset):
"""
Return the document stored at file position `offset`.
"""Get document corresponding to `offset`.
The offset can be obtained from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`.
Parameters
----------
offset : int
Position of the document in the file (in bytes).
Returns
-------
list of (int, float)
Document in BoW format.
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
Expand Down
31 changes: 19 additions & 12 deletions gensim/corpora/csvcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
# Copyright (C) 2013 Zygmunt Zając <[email protected]>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in CSV format.
"""
"""Corpus in CSV format."""


from __future__ import with_statement
Expand All @@ -18,22 +15,28 @@

from gensim import interfaces, utils

logger = logging.getLogger('gensim.corpora.csvcorpus')
logger = logging.getLogger(__name__)


class CsvCorpus(interfaces.CorpusABC):
"""
Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
based on the file content.
"""Corpus in CSV format.
Notes
-----
The CSV delimiter, headers etc. are guessed automatically based on the file content.
All row values are expected to be ints/floats.
"""

def __init__(self, fname, labels):
"""
Initialize the corpus from a file.
`labels` = are class labels present in the input file? => skip the first column
Parameters
----------
fname : str
Path to corpus.
labels : bool
If True, ignore the first column (class labels).
"""
logger.info("loading corpus from %s", fname)
Expand All @@ -48,8 +51,12 @@ def __init__(self, fname, labels):
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""Iterate over the corpus, returning one BoW vector at a time.
Yields
------
list of (int, float)
Document in BoW format.
"""
reader = csv.reader(utils.smart_open(self.fname), self.dialect)
Expand Down
Loading

0 comments on commit c5f487d

Please sign in to comment.