From a67099045d90a749ec77b7ffad3a8b30dd6c5163 Mon Sep 17 00:00:00 2001 From: Andrew Chang Date: Wed, 3 Jul 2019 12:42:45 -0400 Subject: [PATCH] GH-856: Fixed deprecated references to TaggedCorpus in experiments and data_fetcher.py docs according to the Corpus update that addressed GH-232 --- flair/data_fetcher.py | 16 ++++++++-------- resources/docs/EXPERIMENTS.md | 32 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/flair/data_fetcher.py b/flair/data_fetcher.py index de5afac04..bfd32ac91 100644 --- a/flair/data_fetcher.py +++ b/flair/data_fetcher.py @@ -122,13 +122,13 @@ def load_corpora( @deprecated(version="0.4.1", reason="Use 'flair.datasets' instead.") def load_corpus(task: Union[NLPTask, str], base_path: [str, Path] = None) -> Corpus: """ - Helper function to fetch a TaggedCorpus for a specific NLPTask. For this to work you need to first download + Helper function to fetch a Corpus for a specific NLPTask. For this to work you need to first download and put into the appropriate folder structure the corresponding NLP task data. The tutorials on https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this code to create your own data fetchers. :param task: specification of the NLPTask you wish to get :param base_path: path to data folder containing tasks sub folders - :return: a TaggedCorpus consisting of train, dev and test data + :return: a Corpus consisting of train, dev and test data """ # first, try to fetch dataset online @@ -257,7 +257,7 @@ def load_column_corpus( tag_to_biloes=None, ) -> Corpus: """ - Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. + Helper function to get a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 
:param data_folder: base folder with the task data :param column_format: a map specifying the column format @@ -265,7 +265,7 @@ def load_column_corpus( :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train :param tag_to_biloes: whether to convert to BILOES tagging scheme - :return: a TaggedCorpus with annotated train, dev and test data + :return: a Corpus with annotated train, dev and test data """ if type(data_folder) == str: @@ -353,13 +353,13 @@ def load_ud_corpus( data_folder: Union[str, Path], train_file=None, test_file=None, dev_file=None ) -> Corpus: """ - Helper function to get a TaggedCorpus from CoNLL-U column-formatted task data such as the UD corpora + Helper function to get a Corpus from CoNLL-U column-formatted task data such as the UD corpora :param data_folder: base folder with the task data :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train - :return: a TaggedCorpus with annotated train, dev and test data + :return: a Corpus with annotated train, dev and test data """ # automatically identify train / test / dev files if train_file is None: @@ -400,13 +400,13 @@ def load_classification_corpus( max_tokens_per_doc=-1, ) -> Corpus: """ - Helper function to get a TaggedCorpus from text classification-formatted task data + Helper function to get a Corpus from text classification-formatted task data :param data_folder: base folder with the task data :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train - :return: a TaggedCorpus with annotated train, dev and test data + :return: a Corpus with annotated train, dev and test data """ if type(data_folder) == str: diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md index 
6c6525a22..6d24cbeb4 100644 --- a/resources/docs/EXPERIMENTS.md +++ b/resources/docs/EXPERIMENTS.md @@ -30,23 +30,23 @@ This allows the `NLPTaskDataFetcher` class to read the data into our data struct the dataset, as follows: ```python -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') ``` -This gives you a `TaggedCorpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use. +This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use. #### Best Known Configuration The full code to get a state-of-the-art model for English NER is as follows: ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -108,13 +108,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus FastText word embeddings and German contextual string embeddings. The full code then is as follows: ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. 
get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -163,13 +163,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus FastText word embeddings and German contextual string embeddings. The full code then is as follows: ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -218,13 +218,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus FastText word embeddings and German contextual string embeddings. The full code then is as follows: ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -286,13 +286,13 @@ Once you have the data, reproduce our experiments exactly like for CoNLL-03, jus FastText embeddings (they work better on this dataset). 
The full code then is as follows: ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -354,13 +354,13 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d #### Best Known Configuration ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks') +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'pos' @@ -410,13 +410,13 @@ Run the code with extvec embeddings and our proposed contextual string embedding so the algorithm knows that chunking tags and not NER are to be predicted from this data. ```python -from flair.data import TaggedCorpus +from flair.data import Corpus from flair.data_fetcher import NLPTaskDataFetcher, NLPTask from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000) +corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000) # 2. what tag do we want to predict? tag_type = 'np'