# datasets.py (forked from oscarknagg/few-shot)
from torch.utils.data import Dataset
import torch
from PIL import Image
from torchvision import transforms
from skimage import io
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from config import DATA_PATH


class OmniglotDataset(Dataset):
    def __init__(self, subset):
        """Dataset class representing the Omniglot dataset.

        # Arguments:
            subset: Whether the dataset represents the background or evaluation set
        """
        if subset not in ('background', 'evaluation'):
            raise ValueError('subset must be one of (background, evaluation)')
        self.subset = subset

        self.df = pd.DataFrame(self.index_subset(self.subset))

        # Index of dataframe has direct correspondence to item in dataset
        self.df = self.df.assign(id=self.df.index.values)

        # Convert arbitrary class names of dataset to ordered 0-(num_classes - 1) integers
        self.unique_characters = sorted(self.df['class_name'].unique())
        self.class_name_to_id = {self.unique_characters[i]: i for i in range(self.num_classes())}
        self.df = self.df.assign(class_id=self.df['class_name'].apply(lambda c: self.class_name_to_id[c]))

        # Create dicts mapping dataset id to filepath and class id
        self.datasetid_to_filepath = self.df.to_dict()['filepath']
        self.datasetid_to_class_id = self.df.to_dict()['class_id']
    def __getitem__(self, item):
        instance = io.imread(self.datasetid_to_filepath[item])
        # Reindex to channels-first format as expected by PyTorch
        instance = instance[np.newaxis, :, :]

        # Normalise to the 0-1 range
        instance = (instance - instance.min()) / (instance.max() - instance.min())

        label = self.datasetid_to_class_id[item]

        return torch.from_numpy(instance), label

    def __len__(self):
        return len(self.df)

    def num_classes(self):
        return len(self.df['class_name'].unique())
    @staticmethod
    def index_subset(subset):
        """Index a subset by looping through all of its files and recording relevant information.

        # Arguments
            subset: Name of the subset

        # Returns
            A list of dicts containing information about all the image files in a particular subset of the
            Omniglot dataset
        """
        images = []
        print('Indexing {}...'.format(subset))
        # Quick first pass to find total for tqdm bar
        subset_len = 0
        for root, folders, files in os.walk(DATA_PATH + '/Omniglot/images_{}/'.format(subset)):
            subset_len += len([f for f in files if f.endswith('.png')])

        progress_bar = tqdm(total=subset_len)
        for root, folders, files in os.walk(DATA_PATH + '/Omniglot/images_{}/'.format(subset)):
            if len(files) == 0:
                continue

            alphabet = root.split('/')[-2]
            class_name = '{}.{}'.format(alphabet, root.split('/')[-1])

            for f in files:
                progress_bar.update(1)
                images.append({
                    'subset': subset,
                    'alphabet': alphabet,
                    'class_name': class_name,
                    'filepath': os.path.join(root, f)
                })

        progress_bar.close()
        return images
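
# Example usage (a minimal sketch, assuming Omniglot has been downloaded and
# arranged under DATA_PATH/Omniglot/images_background and
# DATA_PATH/Omniglot/images_evaluation as index_subset expects):
#
#     background = OmniglotDataset('background')
#     x, y = background[0]
#     # x: channels-first float tensor, torch.Size([1, 105, 105]) for the
#     # standard 105x105 Omniglot images; y: integer class id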


class MiniImageNet(Dataset):
    def __init__(self, subset):
        """Dataset class representing the miniImageNet dataset.

        # Arguments:
            subset: Whether the dataset represents the background or evaluation set
        """
        if subset not in ('background', 'evaluation'):
            raise ValueError('subset must be one of (background, evaluation)')
        self.subset = subset

        self.df = pd.DataFrame(self.index_subset(self.subset))

        # Index of dataframe has direct correspondence to item in dataset
        self.df = self.df.assign(id=self.df.index.values)

        # Convert arbitrary class names of dataset to ordered 0-(num_classes - 1) integers
        self.unique_characters = sorted(self.df['class_name'].unique())
        self.class_name_to_id = {self.unique_characters[i]: i for i in range(self.num_classes())}
        self.df = self.df.assign(class_id=self.df['class_name'].apply(lambda c: self.class_name_to_id[c]))

        # Create dicts mapping dataset id to filepath and class id
        self.datasetid_to_filepath = self.df.to_dict()['filepath']
        self.datasetid_to_class_id = self.df.to_dict()['class_id']

        # Setup transforms: crop, resize to 84x84 and normalise with the
        # standard ImageNet channel statistics
        self.transform = transforms.Compose([
            transforms.CenterCrop(224),
            transforms.Resize(84),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    def __getitem__(self, item):
        instance = Image.open(self.datasetid_to_filepath[item])
        instance = self.transform(instance)
        label = self.datasetid_to_class_id[item]
        return instance, label

    def __len__(self):
        return len(self.df)

    def num_classes(self):
        return len(self.df['class_name'].unique())
    @staticmethod
    def index_subset(subset):
        """Index a subset by looping through all of its files and recording relevant information.

        # Arguments
            subset: Name of the subset

        # Returns
            A list of dicts containing information about all the image files in a particular subset of the
            miniImageNet dataset
        """
        images = []
        print('Indexing {}...'.format(subset))
        # Quick first pass to find total for tqdm bar
        subset_len = 0
        for root, folders, files in os.walk(DATA_PATH + '/miniImageNet/images_{}/'.format(subset)):
            subset_len += len([f for f in files if f.endswith('.png')])

        progress_bar = tqdm(total=subset_len)
        for root, folders, files in os.walk(DATA_PATH + '/miniImageNet/images_{}/'.format(subset)):
            if len(files) == 0:
                continue

            class_name = root.split('/')[-1]

            for f in files:
                progress_bar.update(1)
                images.append({
                    'subset': subset,
                    'class_name': class_name,
                    'filepath': os.path.join(root, f)
                })

        progress_bar.close()
        return images
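
# Example usage (a minimal sketch, assuming miniImageNet images exist under
# DATA_PATH/miniImageNet/images_background and images_evaluation; the batch
# size of 32 is an arbitrary illustrative choice):
#
#     from torch.utils.data import DataLoader
#
#     evaluation = MiniImageNet('evaluation')
#     loader = DataLoader(evaluation, batch_size=32, shuffle=True)
#     x, y = next(iter(loader))
#     # x: shape (32, 3, 84, 84) after the CenterCrop/Resize/Normalize transforms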


class DummyDataset(Dataset):
    def __init__(self, samples_per_class=10, n_classes=10, n_features=1):
        """Dummy dataset for debugging/testing purposes.

        A sample from the DummyDataset has (n_features + 1) features. The first feature is the index of the sample
        in the data and the remaining features are the class index.

        # Arguments
            samples_per_class: Number of samples per class in the dataset
            n_classes: Number of distinct classes in the dataset
            n_features: Number of extra features each sample should have.
        """
        self.samples_per_class = samples_per_class
        self.n_classes = n_classes
        self.n_features = n_features

        # Create a dataframe to be consistent with other Datasets
        self.df = pd.DataFrame({
            'class_id': [i % self.n_classes for i in range(len(self))]
        })
        self.df = self.df.assign(id=self.df.index.values)

    def __len__(self):
        return self.samples_per_class * self.n_classes

    def __getitem__(self, item):
        class_id = item % self.n_classes
        return np.array([item] + [class_id] * self.n_features, dtype=float), float(class_id)
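

if __name__ == '__main__':
    # Smoke test (a minimal sketch): DummyDataset needs no data on disk, so it
    # can be exercised directly without downloading Omniglot or miniImageNet.
    dummy = DummyDataset(samples_per_class=2, n_classes=3, n_features=1)
    print(len(dummy))  # 6 samples in total
    print(dummy[4])    # (array([4., 1.]), 1.0): sample index 4, class 4 % 3 = 1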