forked from pytorch/examples
Showing 5 changed files with 204 additions and 212 deletions.
README.md
@@ -1,34 +1,34 @@
 # Word-level language modeling RNN

 This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
 By default, the training script uses the provided PTB dataset.
 The trained model can then be used by the generate script to generate new text.

 ```bash
-python main.py -cuda   # Train an LSTM on PTB with CUDA (cuDNN). Should reach perplexity of 116
+python main.py --cuda  # Train an LSTM on PTB with CUDA (cuDNN). Should reach perplexity of 113
 python generate.py     # Generate samples from the trained LSTM model.
 ```

-The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`) which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
+The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`)
+which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.

 The `main.py` script accepts the following arguments:

 ```bash
 optional arguments:
-  -h, --help            show this help message and exit
-  -data DATA            Location of the data corpus
-  -model MODEL          Type of recurrent net. RNN_TANH, RNN_RELU, LSTM, or
-                        GRU.
-  -emsize EMSIZE        Size of word embeddings
-  -nhid NHID            Number of hidden units per layer.
-  -nlayers NLAYERS      Number of layers.
-  -lr LR                Initial learning rate.
-  -clip CLIP            Gradient clipping.
-  -maxepoch MAXEPOCH    Upper epoch limit.
-  -batchsize BATCHSIZE  Batch size.
-  -bptt BPTT            Sequence length.
-  -seed SEED            Random seed.
-  -cuda                 Use CUDA.
-  -reportint REPORTINT  Report interval.
-  -save SAVE            Path to save the final model.
+  -h, --help         show this help message and exit
+  --data DATA        location of the data corpus
+  --model MODEL      type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
+  --emsize EMSIZE    size of word embeddings
+  --nhid NHID        number of hidden units per layer
+  --nlayers NLAYERS  number of layers
+  --lr LR            initial learning rate
+  --clip CLIP        gradient clipping
+  --epochs EPOCHS    upper epoch limit
+  --batch-size N     batch size
+  --bptt BPTT        sequence length
+  --seed SEED        random seed
+  --cuda             use CUDA
+  --log-interval N   report interval
+  --save SAVE        path to save the final model
 ```
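The flag rename from single-dash (`-cuda`) to double-dash (`--cuda`) follows standard Python argparse conventions, where an option such as `--batch-size` surfaces as `args.batch_size`. As a minimal sketch of how a parser matching the help text above might be declared (main.py is not among the hunks shown here, so the default values below are illustrative assumptions):

```python
import argparse

# Sketch of a parser matching the help text above; main.py is not shown in
# this commit view, so every default below is an assumption for illustration.
parser = argparse.ArgumentParser(description='Word-level language modeling RNN')
parser.add_argument('--data', type=str, default='./data/ptb',
                    help='location of the data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
parser.add_argument('--epochs', type=int, default=6,
                    help='upper epoch limit')
parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                    help='batch size')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
# ... the remaining flags (--emsize, --nhid, --nlayers, --lr, --clip, --bptt,
# --seed, --log-interval, --save) follow the same pattern.
args = parser.parse_args()

# argparse converts dashes to underscores on the namespace:
print(args.batch_size, args.cuda)
```

With a parser like this, `python main.py --cuda --epochs 10` overrides only the flags given and leaves everything else at its default.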
data.py
@@ -1,53 +1,48 @@
-########################################
-# Data Fetching Script for PTB
-########################################
-
 import os
 import torch
-import os.path

 class Dictionary(object):
     def __init__(self):
         self.word2idx = {}
         self.idx2word = []

-    def addword(self, word):
+    def add_word(self, word):
         if word not in self.word2idx:
             self.idx2word.append(word)
             self.word2idx[word] = len(self.idx2word) - 1

         return self.word2idx[word]

-    def ntokens(self):
+    def __len__(self):
         return len(self.idx2word)


 class Corpus(object):
     def __init__(self, path):
-        self.dic = Dictionary()
-        self.train=self._loadfile(os.path.join(path, 'train.txt'))
-        self.valid=self._loadfile(os.path.join(path, 'valid.txt'))
-        self.test =self._loadfile(os.path.join(path, 'test.txt'))
-
-    # Tokenize a text file.
-    def _loadfile(self, path):
-        # Read words from file.
-        assert(os.path.exists(path))
-        tokens = 0
+        self.dictionary = Dictionary()
+        self.train = self.tokenize(os.path.join(path, 'train.txt'))
+        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+        self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+    def tokenize(self, path):
+        """Tokenizes a text file."""
+        assert os.path.exists(path)
+        # Add words to the dictionary
         with open(path, 'r') as f:
+            tokens = 0
             for line in f:
                 words = line.split() + ['<eos>']
+                tokens += len(words)
                 for word in words:
-                    self.dic.addword(word)
-                    tokens += 1
+                    self.dictionary.add_word(word)

         # Tokenize file content
         with open(path, 'r') as f:
             ids = torch.LongTensor(tokens)
             token = 0
             for line in f:
                 words = line.split() + ['<eos>']
                 for word in words:
-                    ids[token] = self.dic.word2idx[word]
+                    ids[token] = self.dictionary.word2idx[word]
                     token += 1

-        # Final dataset.
-
         return ids
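To make the refactored API concrete, here is a minimal usage sketch; the data directory is an assumed path that must contain the `train.txt`, `valid.txt`, and `test.txt` files the constructor expects:

```python
import data  # the module above, assuming it is saved as data.py

# Hypothetical location of the PTB text files.
corpus = data.Corpus('./data/ptb')

# Dictionary.__len__ exposes the vocabulary size directly through len().
print('vocabulary size:', len(corpus.dictionary))

# Each split is a flat 1-D LongTensor with one entry per token, including
# the <eos> marker appended to every line.
print('training tokens:', corpus.train.size(0))

# Map the first ten indices back to words to sanity-check the round trip.
print([corpus.dictionary.idx2word[i] for i in corpus.train[:10].tolist()])
```

The two passes over each file are deliberate: the first pass only counts tokens so the LongTensor can be allocated once at the right size, and the second fills it in place, avoiding a growing Python list for multi-million-token corpora.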