-
Notifications
You must be signed in to change notification settings - Fork 3
/
checkpoints.py
141 lines (109 loc) · 4.24 KB
/
checkpoints.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import os
import torch
class Checkpoint(object):
    """Keep a rolling window of on-disk checkpoints for a set of named models.

    A checkpoint is identified by a suffix. For every registered model the
    files ``model_<name>_<suffix>.pth`` and ``opt_<name>_<suffix>.pth`` live
    in ``path``. ``self.checkpoints`` holds the tracked suffixes ordered
    oldest first: ``checkpoint`` appends to the end and ``clean`` removes
    from the front.
    """

    def __init__(self, path, max_n=3):
        """Create a manager for directory ``path`` keeping ``max_n`` checkpoints."""
        self.path = path
        self.max_n = max_n
        # name -> {'model': object, 'opt': object or None}
        self.models = {}
        # tracked suffixes, oldest first
        self.checkpoints = []

    def add_model(self, name, model, opt=None):
        """Register ``model`` (and optionally its optimizer) under ``name``.

        Raises:
            AssertionError: if ``name`` has already been registered.
        """
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while keeping the same exception type and message.
        if name in self.models:
            raise AssertionError("Model {} already added".format(name))
        self.models[name] = {'model': model, 'opt': opt}

    def limit(self):
        """Return the maximum number of checkpoints kept."""
        return self.max_n

    def add_checkpoints(self, name=None):
        """Rebuild ``self.checkpoints`` from the ``.pth`` files in ``self.path``.

        The suffix of a file is the text between its last underscore and the
        ``.pth`` extension. A suffix is kept only if the model file of every
        registered model exists for it. The result is ordered by modification
        time (oldest first) and truncated to the ``max_n`` newest entries.
        No file is deleted here; older checkpoints are merely not tracked.

        Args:
            name: Unused; accepted for backward compatibility.
        """
        ext = '.pth'
        newest_mtime = {}
        for fn in os.listdir(self.path):
            if not fn.endswith(ext):
                continue
            # Cut the extension off as a suffix. ``str.rstrip('.pth')`` strips
            # a *character set* and would also eat trailing 'p', 't', 'h' or
            # '.' characters that belong to the checkpoint suffix itself.
            sfx = fn[:-len(ext)].split("_")[-1]
            mtime = os.path.getmtime(self._get_full_path(fn))
            newest_mtime[sfx] = max(mtime, newest_mtime.get(sfx, mtime))

        # Keep only suffixes that are complete for all registered models.
        complete = [
            (sfx, mtime)
            for sfx, mtime in newest_mtime.items()
            if self._locate(sfx)[0]
        ]
        # Always sort so the list is oldest-first regardless of its length.
        complete.sort(key=lambda item: item[1])
        # ``seq[-0:]`` is the whole sequence, so max_n == 0 needs a guard.
        newest = complete[-self.max_n:] if self.max_n > 0 else []
        self.checkpoints = [sfx for sfx, _ in newest]

    def __len__(self):
        """Return the number of tracked checkpoints."""
        return len(self.checkpoints)

    def _get_full_path(self, filename):
        """Return ``filename`` joined onto the checkpoint directory."""
        return os.path.join(self.path, filename)

    def clean(self, n_remove):
        """Delete the files of the ``n_remove`` oldest tracked checkpoints.

        ``n_remove`` is clamped to ``[0, len(self.checkpoints)]``.

        Returns:
            The list of suffixes that were removed from tracking.
        """
        # Clamp at zero too: a negative count would otherwise slice from the
        # end and silently drop tracked suffixes without deleting their files.
        n_remove = max(0, min(n_remove, len(self.checkpoints)))
        removed = self.checkpoints[:n_remove]
        for sfx in removed:
            for name in self.models:
                for d in ('model', 'opt'):
                    self._rm(self._filename(d, name, sfx))
        self.checkpoints = self.checkpoints[n_remove:]
        return removed

    def _rm(self, fn):
        """Remove file ``fn`` from the checkpoint directory if it exists."""
        path = self._get_full_path(fn)
        if os.path.isfile(path):
            os.remove(path)

    def _filename(self, d, name, suffix):
        """Return the file name for kind ``d`` ('model' or 'opt') of a model."""
        return "{}_{}_{}.pth".format(d, name, suffix)

    def load(self, suffix):
        """Load the state of every registered model/optimizer for ``suffix``.

        Returns:
            True if the checkpoint was found and loaded, False if ``suffix``
            is None or a model file is missing.
        """
        if suffix is None:
            return False
        found, paths = self.find(suffix)
        if not found:
            return False
        for name, data in self.models.items():
            for d in ('model', 'opt'):
                path = paths[name][d]
                # ``find`` tolerates a missing optimizer file, so skip files
                # that are absent instead of crashing inside ``torch.load``.
                if data[d] is not None and os.path.isfile(path):
                    # NOTE: torch.load unpickles the file; only load
                    # checkpoints from trusted sources.
                    data[d].load_state_dict(torch.load(path))
        return True

    def _locate(self, suffix):
        """Compute the file paths for ``suffix`` without touching any state.

        Returns:
            ``(found, paths)`` where ``paths[name][kind]`` is the expected
            path and ``found`` is False if any *model* file is missing.
            Missing files of either kind are reported on stdout.
        """
        paths = {}
        found = True
        for name in self.models:
            paths[name] = {}
            for d in ('model', 'opt'):
                path = self._get_full_path(self._filename(d, name, suffix))
                paths[name][d] = path
                if not os.path.isfile(path):
                    print("File not found: ", path)
                    # A missing optimizer file does not invalidate the
                    # checkpoint; only a missing model file does.
                    if d == 'model':
                        found = False
        return found, paths

    def find(self, suffix, force=False):
        """Look up the files of checkpoint ``suffix`` and start tracking it.

        A found, not yet tracked suffix is inserted at the front of
        ``self.checkpoints`` (i.e. as the oldest entry) while there is room,
        or unconditionally when ``force`` is true, in which case ``max_n``
        grows to fit.

        Returns:
            ``(found, paths)`` as described in ``_locate``.
        """
        found, paths = self._locate(suffix)
        if found and suffix not in self.checkpoints:
            if len(self.checkpoints) < self.max_n or force:
                self.checkpoints.insert(0, suffix)
                if force:
                    self.max_n = max(self.max_n, len(self.checkpoints))
        return found, paths

    def checkpoint(self, suffix):
        """Save all registered models/optimizers under ``suffix``.

        Existing files are not overwritten. Afterwards the oldest
        checkpoints beyond ``max_n`` are deleted.

        Returns:
            The list of suffixes removed to respect ``max_n``.

        Raises:
            AssertionError: if ``suffix`` contains an underscore, which would
                break the suffix parsing in ``add_checkpoints``.
        """
        if '_' in suffix:
            raise AssertionError("Underscores are not allowed")
        # Track each suffix once: a duplicate entry would make the cleanup
        # below delete the files of a checkpoint that is still listed.
        if suffix not in self.checkpoints:
            self.checkpoints.append(suffix)
        for name, data in self.models.items():
            for d in ('model', 'opt'):
                path = self._get_full_path(self._filename(d, name, suffix))
                if not os.path.isfile(path) and data[d] is not None:
                    torch.save(data[d].state_dict(), path)
        n_remove = max(0, len(self.checkpoints) - self.max_n)
        return self.clean(n_remove)