| 1 | +import tarfile |
| 2 | +import tempfile |
| 3 | + |
| 4 | +import backports.lzma as lzma |
| 5 | +import simplejson as json |
| 6 | + |
| 7 | +import cache |
| 8 | +import rqueue |
| 9 | +import storage.local |
| 10 | +store = storage.load() |
| 11 | + |
| 12 | +FILE_TYPES = { |
| 13 | + tarfile.REGTYPE: 'f', |
| 14 | + tarfile.DIRTYPE: 'd', |
| 15 | + tarfile.LNKTYPE: 'l', |
| 16 | + tarfile.SYMTYPE: 's', |
| 17 | + tarfile.CHRTYPE: 'c', |
| 18 | + tarfile.BLKTYPE: 'b', |
| 19 | +} |
| 20 | + |
| 21 | +# queue for requesting diff calculations from workers |
| 22 | +diff_queue = rqueue.CappedCollection(cache.redis_conn, "diff-worker", 1024) |
| 23 | + |
| 24 | + |
| 25 | +def generate_ancestry(image_id, parent_id=None): |
| 26 | + if not parent_id: |
| 27 | + store.put_content(store.image_ancestry_path(image_id), |
| 28 | + json.dumps([image_id])) |
| 29 | + return |
| 30 | + data = store.get_content(store.image_ancestry_path(parent_id)) |
| 31 | + data = json.loads(data) |
| 32 | + data.insert(0, image_id) |
| 33 | + store.put_content(store.image_ancestry_path(image_id), json.dumps(data)) |
| 34 | + |
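|  | +# Illustrative note, not part of the change ("img", "p1", "base" are |
|  | +# hypothetical ids): ancestry files are stored newest-first. If the |
|  | +# parent's ancestry is ["p1", "base"], generate_ancestry("img", "p1") |
|  | +# writes ["img", "p1", "base"] to store.image_ancestry_path("img"); |
|  | +# with no parent it writes just ["img"]. |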
| 35 | + |
| 36 | +class layer_archive(object): |
| 37 | +    '''Context manager for untarring a possibly xz/lzma-compressed archive.''' |
| 38 | + def __init__(self, fobj): |
| 39 | + self.orig_fobj = fobj |
| 40 | + self.lzma_fobj = None |
| 41 | + self.tar_obj = None |
| 42 | + |
| 43 | + def __enter__(self): |
| 44 | + target_fobj = self.orig_fobj |
| 45 | + try: |
| 46 | + # try to decompress the archive |
| 47 | + self.lzma_fobj = lzma.LZMAFile(filename=target_fobj) |
| 48 | + self.lzma_fobj.read() |
| 49 | + self.lzma_fobj.seek(0) |
| 50 | + except lzma._lzma.LZMAError: |
| 51 | +            pass  # it's okay if we can't |
| 52 | + else: |
| 53 | + target_fobj = self.lzma_fobj |
| 54 | + finally: # reset whatever fp we ended up using |
| 55 | + target_fobj.seek(0) |
| 56 | + |
| 57 | + # untar the fobj, whether it was the original or the lzma |
| 58 | + self.tar_obj = tarfile.open(mode='r|*', fileobj=target_fobj) |
| 59 | + return self.tar_obj |
| 60 | + |
| 61 | + def __exit__(self, type, value, traceback): |
| 62 | + # clean up |
| 63 | + self.tar_obj.close() |
| 64 | + self.lzma_fobj.close() |
| 65 | + self.orig_fobj.seek(0) |
| 66 | + |
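|  | +# Usage sketch (illustrative; layer_fobj is any seekable binary file |
|  | +# object holding a layer tarball, plain or xz/lzma-compressed): |
|  | +# |
|  | +#     with layer_archive(layer_fobj) as tar: |
|  | +#         members = tar.getmembers() |
|  | +# |
|  | +# On exit the tar and lzma wrappers are closed and the original file |
|  | +# object is rewound to offset 0, not closed. |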
| 67 | + |
| 68 | +def serialize_tar_info(tar_info): |
| 69 | + '''serialize a tarfile.TarInfo instance |
| 70 | + Take a single tarfile.TarInfo instance and serialize it to a |
| 71 | +    tuple. Detect union whiteouts by filename and mark those paths as |
| 72 | +    deleted in the third element. Union metadata files are skipped |
| 73 | +    (None is returned for them). |
| 74 | + ''' |
| 75 | + is_deleted = False |
| 76 | + filename = tar_info.name |
| 77 | + |
| 78 | + # notice and strip whiteouts |
| 79 | + if filename == ".": |
| 80 | + filename = '/' |
| 81 | + |
| 82 | + if filename.startswith("./"): |
| 83 | + filename = "/" + filename[2:] |
| 84 | + |
| 85 | + if filename.startswith("/.wh."): |
| 86 | + filename = "/" + filename[5:] |
| 87 | + is_deleted = True |
| 88 | + |
| 89 | + if filename.startswith("/.wh."): |
| 90 | + return None |
| 91 | + |
| 92 | + return ( |
| 93 | + filename, |
| 94 | + FILE_TYPES[tar_info.type], |
| 95 | + is_deleted, |
| 96 | + tar_info.size, |
| 97 | + tar_info.mtime, |
| 98 | + tar_info.mode, |
| 99 | + tar_info.uid, |
| 100 | + tar_info.gid, |
| 101 | + ) |
| 102 | + |
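|  | +# Examples (illustrative names): a regular member "./etc/hosts" becomes |
|  | +# ("/etc/hosts", 'f', False, size, mtime, mode, uid, gid); a root-level |
|  | +# whiteout "./.wh.foo" becomes ("/foo", 'f', True, ...); aufs metadata |
|  | +# such as "./.wh..wh.plnk" yields None and is dropped by read_tarfile. |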
| 103 | + |
| 104 | +def read_tarfile(tar_fobj): |
| 105 | + # iterate over each file in the tar and then serialize it |
| 106 | + return [ |
| 107 | + i for i in [serialize_tar_info(m) for m in tar_fobj.getmembers()] |
| 108 | + if i is not None |
| 109 | + ] |
| 110 | + |
| 111 | + |
| 112 | +def get_image_files_cache(image_id): |
| 113 | + image_files_path = store.image_files_path(image_id) |
| 114 | + if store.exists(image_files_path): |
| 115 | + return store.get_content(image_files_path) |
| 116 | + |
| 117 | + |
| 118 | +def set_image_files_cache(image_id, files_json): |
| 119 | + image_files_path = store.image_files_path(image_id) |
| 120 | + store.put_content(image_files_path, files_json) |
| 121 | + |
| 122 | + |
| 123 | +def get_image_files_from_fobj(layer_file): |
| 124 | +    '''get files from an open file object containing a layer |
| 125 | + |
| 126 | +    Decompress and untar the passed-in file object and return the |
| 127 | +    serialized file listing for the layer it contains. Downloading |
| 128 | +    the layer is left to the caller. |
| 129 | + |
| 130 | +    ''' |
| 131 | + layer_file.seek(0) |
| 132 | + with layer_archive(layer_file) as tar_fobj: |
| 133 | + # read passed in tarfile directly |
| 134 | + files = read_tarfile(tar_fobj) |
| 135 | + |
| 136 | + return files |
| 137 | + |
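|  | +# Usage sketch (illustrative; layer_fobj is an open, seekable file |
|  | +# object containing the raw layer data): |
|  | +# |
|  | +#     file_infos = get_image_files_from_fobj(layer_fobj) |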
| 138 | + |
| 139 | +def get_image_files_json(image_id): |
| 140 | +    '''return the JSON file listing for the given image id |
| 141 | + |
| 142 | +    Return the cached listing when one exists; otherwise download |
| 143 | +    the layer, determine its file contents, and cache the result. |
| 144 | +    ''' |
| 145 | + files_json = get_image_files_cache(image_id) |
| 146 | + if files_json: |
| 147 | + return files_json |
| 148 | + |
| 149 | + # download remote layer |
| 150 | + image_path = store.image_layer_path(image_id) |
| 151 | + with tempfile.TemporaryFile() as tmp_fobj: |
| 152 | + for buf in store.stream_read(image_path): |
| 153 | + tmp_fobj.write(buf) |
| 154 | + tmp_fobj.seek(0) |
| 155 | + # decompress and untar layer |
| 156 | + files_json = json.dumps(get_image_files_from_fobj(tmp_fobj)) |
| 157 | + set_image_files_cache(image_id, files_json) |
| 158 | + return files_json |
| 159 | + |
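|  | +# Usage sketch (illustrative; "abc123" is a made-up image id): |
|  | +# |
|  | +#     files = json.loads(get_image_files_json("abc123")) |
|  | +# |
|  | +# Each entry follows the (filename, type, is_deleted, size, mtime, |
|  | +# mode, uid, gid) layout from serialize_tar_info above (tuples arrive |
|  | +# back as JSON arrays). |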
| 160 | + |
| 161 | +def get_file_info_map(file_infos): |
| 162 | +    '''convert a list of file info tuples to a dictionary |
| 163 | + Convert a list of layer file info tuples to a dictionary using the |
| 164 | + first element (filename) as the key. |
| 165 | + ''' |
| 166 | + return dict((file_info[0], file_info[1:]) for file_info in file_infos) |
| 167 | + |
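|  | +# Example (illustrative values): [("/etc/hosts", 'f', False, 12, 0, 420, 0, 0)] |
|  | +# maps to {"/etc/hosts": ('f', False, 12, 0, 420, 0, 0)}. |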
| 168 | + |
| 169 | +def get_image_diff_cache(image_id): |
| 170 | + image_diff_path = store.image_diff_path(image_id) |
| 171 | + if store.exists(image_diff_path): |
| 172 | + return store.get_content(image_diff_path) |
| 173 | + |
| 174 | + |
| 175 | +def set_image_diff_cache(image_id, diff_json): |
| 176 | + image_diff_path = store.image_diff_path(image_id) |
| 177 | + store.put_content(image_diff_path, diff_json) |
| 178 | + |
| 179 | + |
| 180 | +def get_image_diff_json(image_id): |
| 181 | + '''get json describing file differences in layer |
| 182 | + Calculate the diff information for the files contained within |
| 183 | + the layer. Return a dictionary of lists grouped by whether they |
| 184 | + were deleted, changed or created in this layer. |
| 185 | + |
| 186 | + To determine what happened to a file in a layer we walk backwards |
| 187 | + through the ancestry until we see the file in an older layer. Based |
| 188 | + on whether the file was previously deleted or not we know whether |
| 189 | + the file was created or modified. If we do not find the file in an |
| 190 | + ancestor we know the file was just created. |
| 191 | + |
| 192 | + - File marked as deleted by union fs tar: DELETED |
| 193 | + - Ancestor contains non-deleted file: CHANGED |
| 194 | + - Ancestor contains deleted marked file: CREATED |
| 195 | + - No ancestor contains file: CREATED |
| 196 | + ''' |
| 197 | + |
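|  | +    # Worked example (illustrative): if this layer holds "/a" and a |
|  | +    # whiteout entry for "/b", while its parent holds "/a" and "/b", |
|  | +    # then "/b" lands in 'deleted', "/a" in 'changed', and any file |
|  | +    # seen in no ancestor at all lands in 'created'. |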
| 198 | + # check the cache first |
| 199 | + diff_json = get_image_diff_cache(image_id) |
| 200 | + if diff_json: |
| 201 | + return diff_json |
| 202 | + |
| 203 | + # we need all ancestral layers to calculate the diff |
| 204 | + ancestry_path = store.image_ancestry_path(image_id) |
| 205 | + ancestry = json.loads(store.get_content(ancestry_path))[1:] |
| 206 | + # grab the files from the layer |
| 207 | + files = json.loads(get_image_files_json(image_id)) |
| 208 | + # convert to a dictionary by filename |
| 209 | + info_map = get_file_info_map(files) |
| 210 | + |
| 211 | + deleted = {} |
| 212 | + changed = {} |
| 213 | + created = {} |
| 214 | + |
| 215 | + # walk backwards in time by iterating the ancestry |
| 216 | + for id in ancestry: |
| 217 | + # get the files from the current ancestor |
| 218 | + ancestor_files = json.loads(get_image_files_json(id)) |
| 219 | + # convert to a dictionary of the files mapped by filename |
| 220 | + ancestor_map = get_file_info_map(ancestor_files) |
| 221 | + # iterate over each of the top layer's files |
| 222 | + for filename, info in info_map.items(): |
| 223 | + ancestor_info = ancestor_map.get(filename) |
| 224 | + # if the file in the top layer is already marked as deleted |
| 225 | + if info[1]: |
| 226 | + deleted[filename] = info |
| 227 | + del info_map[filename] |
| 228 | + # if the file exists in the current ancestor |
| 229 | + elif ancestor_info: |
| 230 | + # if the file was marked as deleted in the ancestor |
| 231 | + if ancestor_info[1]: |
| 232 | +                    # it must have been just created in the top layer |
| 233 | + created[filename] = info |
| 234 | + else: |
| 235 | + # otherwise it must have simply changed in the top layer |
| 236 | + changed[filename] = info |
| 237 | + del info_map[filename] |
| 238 | + created.update(info_map) |
| 239 | + |
| 240 | + # return dictionary of files grouped by file action |
| 241 | + diff_json = json.dumps({ |
| 242 | + 'deleted': deleted, |
| 243 | + 'changed': changed, |
| 244 | + 'created': created, |
| 245 | + }) |
| 246 | + |
| 247 | + # store results in cache |
| 248 | + set_image_diff_cache(image_id, diff_json) |
| 249 | + |
| 250 | + return diff_json |