flairNLPGH-856: Fixed deprecated references to TaggedCorpus in experiments and data_fetcher.py docs, in line with the Corpus update that addressed flairNLPGH-232
aychang95 committed Jul 3, 2019
1 parent 34f2490 commit a670990
Showing 2 changed files with 24 additions and 24 deletions.
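The change itself is mechanical: the remaining documentation references to the old `TaggedCorpus` class become `Corpus`, the name introduced by the flairNLPGH-232 refactor. From a caller's point of view the rename looks roughly like the sketch below (a hedged illustration, not code taken from the diff):

```python
# Before the flairNLPGH-232 rename (old class name):
# from flair.data import TaggedCorpus
# corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')

# After the rename, as the updated docs in this commit show:
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
```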
16 changes: 8 additions & 8 deletions flair/data_fetcher.py
@@ -122,13 +122,13 @@ def load_corpora(
@deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.")
def load_corpus(task: Union[NLPTask, str], base_path: [str, Path] = None) -> Corpus:
"""
- Helper function to fetch a TaggedCorpus for a specific NLPTask. For this to work you need to first download
+ Helper function to fetch a Corpus for a specific NLPTask. For this to work you need to first download
and put into the appropriate folder structure the corresponding NLP task data. The tutorials on
https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this
code to create your own data fetchers.
:param task: specification of the NLPTask you wish to get
:param base_path: path to data folder containing tasks sub folders
- :return: a TaggedCorpus consisting of train, dev and test data
+ :return: a Corpus consisting of train, dev and test data
"""

# first, try to fetch dataset online
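Worth noting: the decorator above already marks this helper as deprecated in favour of `flair.datasets`. A hedged sketch of the old call next to the direction it points to (the `flair.datasets.CONLL_03` class name is an assumption based on later flair releases):

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

# deprecated but still working: load an NLPTask the old way
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')

# the replacement suggested by the deprecation message (class name assumed from later flair releases)
import flair.datasets
corpus = flair.datasets.CONLL_03()
```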
@@ -257,15 +257,15 @@ def load_column_corpus(
tag_to_biloes=None,
) -> Corpus:
"""
- Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
+ Helper function to get a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
:param data_folder: base folder with the task data
:param column_format: a map specifying the column format
:param train_file: the name of the train file
:param test_file: the name of the test file
:param dev_file: the name of the dev file, if None, dev data is sampled from train
:param tag_to_biloes: whether to convert to BILOES tagging scheme
- :return: a TaggedCorpus with annotated train, dev and test data
+ :return: a Corpus with annotated train, dev and test data
"""

if type(data_folder) == str:
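For reference, a hedged usage sketch of `load_column_corpus` built only from the parameters documented above; the folder layout, file names, and column map are hypothetical:

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher

# hypothetical column map: column 0 holds the token text, column 1 the NER tag
columns = {0: 'text', 1: 'ner'}

corpus: Corpus = NLPTaskDataFetcher.load_column_corpus(
    'resources/tasks/my_conll_data',  # data_folder (hypothetical path)
    columns,                          # column_format
    train_file='train.txt',
    test_file='test.txt',
    dev_file=None,                    # dev data is sampled from train when None
    # tag_to_biloes is left at its default; per the docstring it converts tags to the BILOES scheme
)
```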
@@ -353,13 +353,13 @@ def load_ud_corpus(
data_folder: Union[str, Path], train_file=None, test_file=None, dev_file=None
) -> Corpus:
"""
- Helper function to get a TaggedCorpus from CoNLL-U column-formatted task data such as the UD corpora
+ Helper function to get a Corpus from CoNLL-U column-formatted task data such as the UD corpora
:param data_folder: base folder with the task data
:param train_file: the name of the train file
:param test_file: the name of the test file
:param dev_file: the name of the dev file, if None, dev data is sampled from train
- :return: a TaggedCorpus with annotated train, dev and test data
+ :return: a Corpus with annotated train, dev and test data
"""
# automatically identify train / test / dev files
if train_file is None:
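Along the same lines, a hedged sketch of `load_ud_corpus`; since the function identifies the train/test/dev files itself when they are not given, pointing it at a folder of CoNLL-U files is usually enough (the path is hypothetical):

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher

# hypothetical folder holding CoNLL-U files, e.g. an unpacked UD treebank
corpus: Corpus = NLPTaskDataFetcher.load_ud_corpus('resources/tasks/ud_english')
# train_file / test_file / dev_file are omitted, so they are identified automatically
```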
@@ -400,13 +400,13 @@ def load_classification_corpus(
max_tokens_per_doc=-1,
) -> Corpus:
"""
- Helper function to get a TaggedCorpus from text classification-formatted task data
+ Helper function to get a Corpus from text classification-formatted task data
:param data_folder: base folder with the task data
:param train_file: the name of the train file
:param test_file: the name of the test file
:param dev_file: the name of the dev file, if None, dev data is sampled from train
- :return: a TaggedCorpus with annotated train, dev and test data
+ :return: a Corpus with annotated train, dev and test data
"""

if type(data_folder) == str:
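Finally, a hedged sketch of `load_classification_corpus`; the path and file names are illustrative, and the token cap is arbitrary:

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher

# hypothetical folder with train/test/dev files in flair's text-classification format
corpus: Corpus = NLPTaskDataFetcher.load_classification_corpus(
    'resources/tasks/my_text_classification_data',
    train_file='train.txt',
    test_file='test.txt',
    dev_file='dev.txt',
    max_tokens_per_doc=512,  # the signature above defaults this to -1; 512 is arbitrary
)
```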
32 changes: 16 additions & 16 deletions resources/docs/EXPERIMENTS.md
@@ -30,23 +30,23 @@ This allows the `NLPTaskDataFetcher` class to read the data into our data struct
the dataset, as follows:

```python
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
```

- This gives you a `TaggedCorpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use.
+ This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use.
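Concretely, those two steps look roughly like the following (a sketch that assumes the `corpus` variable from the snippet above; the embedding choices are placeholders, not the recommended configuration, which follows below):

```python
from flair.embeddings import WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings

# 1. pick the tag type to predict and build its label inventory from the corpus
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# 2. init some embeddings (model names here are placeholders)
embeddings = StackedEmbeddings(embeddings=[
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('news-forward'),
    PooledFlairEmbeddings('news-backward'),
])
```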

#### Best Known Configuration

The full code to get a state-of-the-art model for English NER is as follows:

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -108,13 +108,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus
FastText word embeddings and German contextual string embeddings. The full code then is as follows:

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -163,13 +163,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus
FastText word embeddings and German contextual string embeddings. The full code then is as follows:

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -218,13 +218,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus
FastText word embeddings and German contextual string embeddings. The full code then is as follows:

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -286,13 +286,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus
FastText embeddings (they work better on this dataset). The full code then is as follows:

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -354,13 +354,13 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d
#### Best Known Configuration

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks')
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'pos'
@@ -410,13 +410,13 @@ Run the code with extvec embeddings and our proposed contextual string embedding
so the algorithm knows that chunking tags and not NER are to be predicted from this data.

```python
- from flair.data import TaggedCorpus
+ from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
- corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
+ corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)

# 2. what tag do we want to predict?
tag_type = 'np'
