# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for creating datasets"""
import os
import math
from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
from . import corpora

TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2

def should_split(split):
    """
    Given split proportions, check whether the dataset should actually be split.

    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    return max(split) / sum(split) != 1.

def get_ext(path):
    """gets path extension"""
    return os.path.splitext(path)[1]

def get_dataset(path, **kwargs):
    """Get a dataset object based on keyword args and the file at `path`."""
    if supported_corpus(path):
        return corpora.NAMED_CORPORA[path](**kwargs)
    ext = get_ext(path)
    if '.json' in ext:
        text = json_dataset(path, **kwargs)
    elif ext in ['.csv', '.tsv']:
        text = csv_dataset(path, **kwargs)
    else:
        raise NotImplementedError('data file type %s is not supported' % ext)
    return text
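
# Example (a sketch; the file path below is hypothetical, not shipped with this
# module): a `.json` file is dispatched to `json_dataset`, a `.csv`/`.tsv` file
# to `csv_dataset`, and a bare name listed in `corpora.NAMED_CORPORA` to the
# corresponding corpus class.
#
#   ds = get_dataset('data/train.json', text_key='text', label_key='label')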

def supported_corpus(corpus_name):
    """checks if corpus name is defined in `corpora.py`"""
    return corpus_name in corpora.NAMED_CORPORA

def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
                 delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
                 model_type='bpe', pad_token=0, character_coverage=1.0, non_binary_cols=None, **kwargs):
    """Create a dataset (and a matching tokenizer) for the most common options."""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols

    def get_dataset_from_path(path_):
        if lazy:
            # get lazily loaded dataset
            named_corpora = False
            if supported_corpus(path_):
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_, text_key=text_key,
                                   label_key=label_key, binarize_sent=binarize_sent,
                                   delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
        else:
            # get dataset
            text = get_dataset(path_, text_key=text_key, label_key=label_key,
                               binarize_sent=binarize_sent, delim=delim,
                               drop_unlabeled=drop_unlabeled, loose_json=loose,
                               preprocess_fn=process_fn)
        return text

    # get one or multiple datasets and concatenate
    if isinstance(path, str):
        path = [path]
    datasets = [get_dataset_from_path(p) for p in path]
    if len(datasets) == 1:
        ds = datasets[0]
    else:
        ds = ConcatDataset(datasets)

    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
                                   pad_token, character_coverage, **kwargs)

    ds_type = kwargs.get('ds_type', '')
    ds.SetTokenizer(tokenizer)

    # Split dataset into train/val/test (and wrap bert/gpt2 datasets)
    if should_split(split):
        ds = split_ds(ds, split)
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs.get('presplit_sentences', False)
            ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length,
                                            presplit_sentences=presplit_sentences)
                  if d is not None else None for d in ds]
        elif ds_type.lower() == 'gpt2':
            ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
    else:
        if ds_type.lower() == 'bert':
            presplit_sentences = kwargs.get('presplit_sentences', False)
            ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length,
                                           presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            ds = GPT2Dataset(ds, max_seq_len=seq_length)
    return ds, tokenizer
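
# Example usage (a minimal sketch; the path and keyword values below are
# hypothetical and only illustrate the call signature, assuming `make_tokenizer`
# accepts the tokenizer class name as its type string):
#
#   ds, tokenizer = make_dataset('data/corpus.json',
#                                seq_length=512,
#                                text_key='text',
#                                label_key='label',
#                                split=[.8, .1, .1],
#                                tokenizer_type='GPT2BPETokenizer',
#                                ds_type='gpt2')
#   # with a three-way split, `ds` is a list of train/val/test datasets
#   train_ds, val_ds, test_ds = ds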