Reduce memory usage for chunk_hashes
On a 180MB file this reduced total memory
usage by approximately 40%. It was also marginally
faster.

I've also added the start of unit tests for the writer module,
beginning with some basic tests for the chunk_hashes
function.
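The saving comes from not materializing every 1MB slice up front: the old list comprehension kept all slices alive alongside the original string, while the rewrite slices and hashes one chunk at a time. Below is a standalone sketch (Python 3, not part of the commit; the function names are illustrative) that makes the peak-memory difference visible with tracemalloc:

    import hashlib
    import tracemalloc

    CHUNK = 1024 * 1024
    data = b'a' * (32 * CHUNK)  # stand-in payload; the commit measured a 180MB file

    def all_at_once(data):
        # Old strategy: build every 1MB slice first, then hash them.
        slices = [data[i * CHUNK:(i + 1) * CHUNK] for i in range(len(data) // CHUNK)]
        return [hashlib.sha256(s).digest() for s in slices]

    def one_at_a_time(data):
        # New strategy: slice and hash a single chunk per iteration.
        hashes = []
        for i in range(len(data) // CHUNK):
            hashes.append(hashlib.sha256(data[i * CHUNK:(i + 1) * CHUNK]).digest())
        return hashes

    for fn in (all_at_once, one_at_a_time):
        tracemalloc.start()
        fn(data)
        _, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        print(fn.__name__, peak)  # the list-of-slices version peaks roughly len(data) higher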
jamesls committed Sep 6, 2012
1 parent f1b007e commit f117db5
Showing 2 changed files with 37 additions and 9 deletions.
20 changes: 11 additions & 9 deletions boto/glacier/writer.py
@@ -28,15 +28,17 @@
 import json
 
 
-def chunk_hashes(str):
-    """
-    Break up the byte-string into 1MB chunks and return sha256 hashes
-    for each.
-    """
-    chunk = 1024 * 1024
-    chunk_count = int(math.ceil(len(str) / float(chunk)))
-    chunks = [str[i * chunk:(i + 1) * chunk] for i in range(chunk_count)]
-    return [hashlib.sha256(x).digest() for x in chunks]
+_ONE_MEGABYTE = 1024 * 1024
+
+
+def chunk_hashes(bytestring, chunk_size=_ONE_MEGABYTE):
+    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
+    hashes = []
+    for i in xrange(chunk_count):
+        start = i * chunk_size
+        end = (i + 1) * chunk_size
+        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
+    return hashes
 
 
 def tree_hash(fo):
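For reference, a minimal usage sketch of the new signature (not part of the commit; it follows the Python 2 string handling used elsewhere in this module):

    from boto.glacier.writer import chunk_hashes

    # Three full 1MB chunks plus a 5-byte tail -> four SHA-256 digests.
    hashes = chunk_hashes('a' * (3 * 1024 * 1024 + 5))
    assert len(hashes) == 4
    assert all(len(h) == 32 for h in hashes)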
26 changes: 26 additions & 0 deletions tests/unit/glacier/test_writer.py
@@ -0,0 +1,26 @@
+from hashlib import sha256
+
+from tests.unit import unittest
+import mock
+
+from boto.glacier.writer import Writer, chunk_hashes
+
+
+class TestChunking(unittest.TestCase):
+    def test_chunk_hashes_exact(self):
+        chunks = chunk_hashes('a' * (2 * 1024 * 1024))
+        self.assertEqual(len(chunks), 2)
+        self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest())
+
+    def test_chunks_with_leftovers(self):
+        bytestring = 'a' * (2 * 1024 * 1024 + 20)
+        chunks = chunk_hashes(bytestring)
+        self.assertEqual(len(chunks), 3)
+        self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest())
+        self.assertEqual(chunks[1], sha256('a' * 1024 * 1024).digest())
+        self.assertEqual(chunks[2], sha256('a' * 20).digest())
+
+    def test_less_than_one_chunk(self):
+        chunks = chunk_hashes('aaaa')
+        self.assertEqual(len(chunks), 1)
+        self.assertEqual(chunks[0], sha256('aaaa').digest())
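Not part of this commit, but as a sketch along the same lines, the new chunk_size keyword could get its own case in TestChunking (hypothetical test name):

    def test_custom_chunk_size(self):
        # 'aaaa' split into 2-byte chunks should yield two hashes of 'aa'.
        chunks = chunk_hashes('aaaa', chunk_size=2)
        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0], sha256('aa').digest())
        self.assertEqual(chunks[1], sha256('aa').digest())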
