import hashlib
import logging
import os
import pickle
import re
import tempfile
from os import path

from flask import g, has_request_context
from ruamel import yaml

from utils import atomic_write_file

from . import querylog

yaml_loader = yaml.YAML(typ="safe", pure=True)

logger = logging.getLogger(__name__)

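# Process-wide registry of YamlFile singletons, keyed by absolute filename.
# Unlike the per-request data cache used in `access()`, these small wrapper
# objects live for the lifetime of the process.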
YAML_FILES_CACHE = {}


class YamlFile:
    """Data from a YAML file, accessible as if it is a dictionary.

    Use like so:

        file = YamlFile.for_file('path/to/file.yaml')

        if file.exists():
            print(file['key1'])
        else:
            print('oh no')

    Since loading the YAML files tends to be slow (it takes ~1s to load all
    YAML files for a single language in the course of rendering the main code
    editor page), we do some caching work:

    - We cache the loaded data in memory, on the Flask 'current request
      globals' object. This means that accessing the same file twice in the
      same request will only load it once. Without this, loading YAMLs takes
      2s instead of 1s because of the duplicate loads.
    - To keep the application memory footprint low, we don't cache the data
      permanently, but drop it after the request is done.
    - After we have successfully loaded a YAML file, we write a pickled
      version of it to disk, so that we can load the pickled version faster
      in the future (loading pickled data is ~400x faster than parsing a
      YAML file).
    - Pickled files are purely a server-side optimization. They should not be
      checked in, and in fact by default this class will only generate them
      when running on Heroku.
    - Nevertheless, we do some timestamp checking, to make sure we only read
      the pickle file if it is newer than the YAML file.
    """
    @staticmethod
    def for_file(filename):
        """Factory function: return a singleton YamlFile instance for the given file."""
        filename = path.abspath(filename)
        if filename not in YAML_FILES_CACHE:
            YAML_FILES_CACHE[filename] = YamlFile(filename)
        return YAML_FILES_CACHE[filename]
    def __init__(self, filename):
        """Create a new YamlFile for the given filename.

        Prefer the `for_file` factory function over calling this directly,
        so that instances are shared.
        """
        self.filename = filename
        self.pickle_filename = path.join(tempfile.gettempdir(), 'hedy_pickles',
                                         f"{pathname_slug(self.filename)}.pickle")
    def exists(self):
        return os.path.exists(self.filename)
    def to_dict(self):
        """Return the contents of the file as a plain dict, or an empty dict if the file doesn't exist.

        You should generally not need to use this: this object can be used in
        places where dicts are expected. The exception is a file that doesn't
        exist: accessing it will throw (we could consider changing that
        behavior to always return an empty dict).
        """
        if self.exists():
            return self.access()
        return {}
    def access(self):
        """Access the data from memory.

        Load it if we haven't loaded it yet or the data on disk changed.
        """
        # Obtain or create a per-request cache dictionary (if we have a request), or an
        # unattached cache object that will disappear after this function returns if we
        # don't have a request.
        yaml_cache = g.setdefault('yaml_cache', {}) if has_request_context() else {}

        cached = yaml_cache.get(self.filename)
        if cached is not None:
            return cached

        data = self.load()
        if not isinstance(data, dict):
            raise RuntimeError(f"Contents of {self.filename} must be a dict, got: {data}")
        yaml_cache[self.filename] = data
        return data
    def load(self):
        """Load the data from disk.

        Load from a pickle file if available, or load the original YAML
        and write a pickle file otherwise.
        """
        yaml_ts = self._file_timestamp(self.filename)
        pickle_ts = self._file_timestamp(self.pickle_filename)

        if pickle_ts and yaml_ts and pickle_ts > yaml_ts:
            # The pickle file is newer than the YAML, just read that
            return self.load_pickle()

        # Otherwise load the YAML and save a pickled copy (atomically, since
        # multiple processes might be trying to write the pickle file in parallel)
        data = self.load_uncached()
        try:
            os.makedirs(path.dirname(self.pickle_filename), exist_ok=True)
            with atomic_write_file(self.pickle_filename) as f:
                pickle.dump(data, f)
        except IOError as e:
            logger.warning('Error writing pickled YAML: %s', e)
        return data
    @querylog.timed_as('load_yaml_pickled')
    def load_pickle(self):
        with open(self.pickle_filename, "rb") as f:
            return pickle.load(f)
    @querylog.timed_as('load_yaml_uncached')
    def load_uncached(self):
        """Load the source YAML file; a missing or unreadable file reads as empty."""
        try:
            with open(self.filename, "r", encoding="utf-8") as f:
                return yaml_loader.load(f)
        except IOError:
            return {}
    def _file_timestamp(self, filename):
        try:
            return os.stat(filename).st_mtime
        except FileNotFoundError:
            return None
    # Make this object look like a read-only 'dict'
    def __getitem__(self, key):
        return self.access()[key]

    def get(self, key, default=None):
        return self.access().get(key, default)

    def has_key(self, k):
        return k in self.access()

    def keys(self):
        return self.access().keys()

    def values(self):
        return self.access().values()

    def items(self):
        return self.access().items()

    def __contains__(self, item):
        return item in self.access()

    def __iter__(self):
        return iter(self.access())

    def __len__(self):
        return len(self.access())


def pathname_slug(x):
    """Turn a path name into an identifier we can use as a file name.

    Take into account that it may contain characters we want to remove,
    that the full path may be too long to use as a file name, and that
    it needs to remain unique.
    """
    x = re.sub(r'[^a-zA-Z0-9_-]', '', x)
    return x[-20:] + md5digest(x)
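# For example, a (hypothetical) path '/app/content/pages/en.yaml' is first
# stripped to its identifier characters, 'appcontentpagesenyaml'; the slug is
# then the last 20 characters of that (for human readability) plus the md5
# hex digest of the whole cleaned string (for uniqueness).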


def md5digest(x):
    return hashlib.md5(x.encode("utf-8")).hexdigest()
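

# A minimal usage sketch (illustration only; 'content/pages.yaml' and the keys
# below are hypothetical, not part of this module). Inside a Flask request the
# parsed data is additionally cached on `g` for the duration of the request:
#
#     pages = YamlFile.for_file('content/pages.yaml')
#     if pages.exists():
#         print(pages['title'])                # dict-style indexing
#         print(pages.get('intro', 'n/a'))     # .get() with a default
#         for key in pages:                    # iteration over keys
#             print(key)
#     else:
#         print('content/pages.yaml not found')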