Skip to content

Commit

Permalink
Merge pull request fastnlp#91 from FengZiYjun/master
Browse files Browse the repository at this point in the history
Merge Preprocessor into DataSet.
  • Loading branch information
FengZiYjun authored Oct 1, 2018
2 parents 281b567 + 81790d7 commit 8b6d082
Show file tree
Hide file tree
Showing 52 changed files with 2,249 additions and 67,893 deletions.
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ python:
install:
- pip install --quiet -r requirements.txt
- pip install pytest pytest-cov
- pip install -U scikit-learn
# command to run tests
script:
- pytest --cov=./
Expand Down
101 changes: 30 additions & 71 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,77 +30,36 @@ Run the following commands to install fastNLP package.
pip install fastNLP
```

### Cloning From GitHub

If you just want to use fastNLP, use:
```shell
git clone https://github.com/fastnlp/fastNLP
cd fastNLP
```

### PyTorch Installation

Visit the [PyTorch official website] for installation instructions based on your system. In general, you could use:
```shell
# using conda
conda install pytorch torchvision -c pytorch
# or using pip
pip3 install torch torchvision
```

### TensorboardX Installation

```shell
pip3 install tensorboardX
```

## Project Structure

```
FastNLP
├── docs
├── fastNLP
│   ├── core
│   │   ├── action.py
│   │   ├── __init__.py
│   │   ├── loss.py
│   │   ├── metrics.py
│   │   ├── optimizer.py
│   │   ├── predictor.py
│   │   ├── preprocess.py
│   │   ├── README.md
│   │   ├── tester.py
│   │   └── trainer.py
│   ├── fastnlp.py
│   ├── __init__.py
│   ├── loader
│   │   ├── base_loader.py
│   │   ├── config_loader.py
│   │   ├── dataset_loader.py
│   │   ├── embed_loader.py
│   │   ├── __init__.py
│   │   └── model_loader.py
│   ├── models
│   ├── modules
│   │   ├── aggregation
│   │   ├── decoder
│   │   ├── encoder
│   │   ├── __init__.py
│   │   ├── interaction
│   │   ├── other_modules.py
│   │   └── utils.py
│   └── saver
├── LICENSE
├── README.md
├── reproduction
├── requirements.txt
├── setup.py
└── test
├── core
├── data_for_tests
├── __init__.py
├── loader
├── modules
└── readme_example.py
```
<table>
<tr>
<td><b> fastNLP </b></td>
<td> an open-source NLP library </td>
</tr>
<tr>
<td><b> fastNLP.core </b></td>
<td> trainer, tester, predictor </td>
</tr>
<tr>
<td><b> fastNLP.loader </b></td>
<td> all kinds of loaders/readers </td>
</tr>
<tr>
<td><b> fastNLP.models </b></td>
<td> a collection of NLP models </td>
</tr>
<tr>
<td><b> fastNLP.modules </b></td>
<td> a collection of PyTorch sub-models/components/wheels </td>
</tr>
<tr>
<td><b> fastNLP.saver </b></td>
<td> all kinds of savers/writers </td>
</tr>
<tr>
<td><b> fastNLP.fastnlp </b></td>
<td> a high-level interface for prediction </td>
</tr>
</table>
4 changes: 2 additions & 2 deletions docs/source/user/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pre-processing data, constructing model and training model.
from fastNLP.modules import aggregation
from fastNLP.modules import decoder
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.dataset_loader import ClassDataSetLoader
from fastNLP.loader.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.inference import ClassificationInfer
Expand Down Expand Up @@ -50,7 +50,7 @@ pre-processing data, constructing model and training model.
train_path = 'test/data_for_tests/text_classify.txt' # training set file
# load dataset
ds_loader = ClassDatasetLoader("train", train_path)
ds_loader = ClassDataSetLoader("train", train_path)
data = ds_loader.load()
# pre-process dataset
Expand Down
4 changes: 2 additions & 2 deletions examples/readme_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from fastNLP.core.predictor import ClassificationInfer
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.dataset_loader import ClassDataSetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregator
from fastNLP.modules import decoder
Expand Down Expand Up @@ -36,7 +36,7 @@ def forward(self, x):
train_path = './data_for_tests/text_classify.txt' # training set file

# load dataset
ds_loader = ClassDatasetLoader(train_path)
ds_loader = ClassDataSetLoader()
data = ds_loader.load()

# pre-process dataset
Expand Down
21 changes: 4 additions & 17 deletions fastNLP/core/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def __init__(self, dataset, batch_size, sampler, use_cuda):
:param dataset: a DataSet object
:param batch_size: int, the size of the batch
:param sampler: a Sampler object
:param use_cuda: bool, whetjher to use GPU
:param use_cuda: bool, whether to use GPU
"""
self.dataset = dataset
Expand All @@ -37,15 +37,12 @@ def __next__(self):
"""
:return batch_x: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
batch_x also contains an item (str: list of int) about origin lengths,
which means ("field_name_origin_len": origin lengths).
E.g.
::
{'text': tensor([[ 0, 1, 2, 3, 0, 0, 0], 4, 5, 2, 6, 7, 8, 9]]), 'text_origin_len': [4, 7]})
batch_y: dict of (str: torch.LongTensor), which means (field name: tensor of shape [batch_size, padding_length])
All tensors in both batch_x and batch_y will be cuda tensors if use_cuda is True.
The names of fields are defined in preprocessor's convert_to_dataset method.
"""
if self.curidx >= len(self.idx_list):
Expand All @@ -54,34 +51,24 @@ def __next__(self):
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
padding_length = {field_name: max(field_length[self.curidx: endidx])
for field_name, field_length in self.lengths.items()}
origin_lengths = {field_name: field_length[self.curidx: endidx]
for field_name, field_length in self.lengths.items()}

batch_x, batch_y = defaultdict(list), defaultdict(list)

# transform index to tensor and do padding for sequences
for idx in range(self.curidx, endidx):
x, y = self.dataset.to_tensor(idx, padding_length)
for name, tensor in x.items():
batch_x[name].append(tensor)
for name, tensor in y.items():
batch_y[name].append(tensor)

batch_origin_length = {}
# combine instances into a batch
# combine instances to form a batch
for batch in (batch_x, batch_y):
for name, tensor_list in batch.items():
if self.use_cuda:
batch[name] = torch.stack(tensor_list, dim=0).cuda()
else:
batch[name] = torch.stack(tensor_list, dim=0)

# add origin lengths in batch_x
for name, tensor in batch_x.items():
if self.use_cuda:
batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name]).cuda()
else:
batch_origin_length[name + "_origin_len"] = torch.LongTensor(origin_lengths[name])
batch_x.update(batch_origin_length)

self.curidx = endidx
return batch_x, batch_y

Loading

0 comments on commit 8b6d082

Please sign in to comment.