Skip to content

Commit

Permalink
Merge pull request RoaringBitmap#395 from damnever/perf/readfrom-nocopy
Browse files Browse the repository at this point in the history
Add FromUnsafeBytes to prevent small allocations caused by ByteInputAdapter
  • Loading branch information
lemire authored Aug 17, 2023
2 parents c3f199b + ba2dbcb commit 952b765
Show file tree
Hide file tree
Showing 5 changed files with 227 additions and 42 deletions.
46 changes: 39 additions & 7 deletions internal/byte_input.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,25 @@ type ByteBuffer struct {
off int
}

// NewByteBuffer creates a new ByteBuffer.
func NewByteBuffer(buf []byte) *ByteBuffer {
return &ByteBuffer{
buf: buf,
}
}

var _ io.Reader = (*ByteBuffer)(nil)

// Read implements io.Reader.
func (b *ByteBuffer) Read(p []byte) (int, error) {
data, err := b.Next(len(p))
if err != nil {
return 0, err
}
copy(p, data)
return len(data), nil
}

// Next returns a slice containing the next n bytes from the reader
// If there are fewer bytes than the given n, io.ErrUnexpectedEOF will be returned
func (b *ByteBuffer) Next(n int) ([]byte, error) {
Expand Down Expand Up @@ -109,26 +128,39 @@ func (b *ByteBuffer) Reset(buf []byte) {
type ByteInputAdapter struct {
r io.Reader
readBytes int
buf [4]byte
}

var _ io.Reader = (*ByteInputAdapter)(nil)

// Read implements io.Reader.
func (b *ByteInputAdapter) Read(buf []byte) (int, error) {
m, err := io.ReadAtLeast(b.r, buf, len(buf))
b.readBytes += m

if err != nil {
return 0, err
}

return m, nil
}

// Next returns a slice containing the next n bytes from the buffer,
// advancing the buffer as if the bytes had been returned by Read.
func (b *ByteInputAdapter) Next(n int) ([]byte, error) {
buf := make([]byte, n)
m, err := io.ReadAtLeast(b.r, buf, n)
b.readBytes += m
_, err := b.Read(buf)

if err != nil {
return nil, err
}

return buf, nil
}

// ReadUInt32 reads uint32 with LittleEndian order
func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
buf, err := b.Next(4)

buf := b.buf[:4]
_, err := b.Read(buf)
if err != nil {
return 0, err
}
Expand All @@ -138,8 +170,8 @@ func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {

// ReadUInt16 reads uint16 with LittleEndian order
func (b *ByteInputAdapter) ReadUInt16() (uint16, error) {
buf, err := b.Next(2)

buf := b.buf[:2]
_, err := b.Read(buf)
if err != nil {
return 0, err
}
Expand Down
45 changes: 28 additions & 17 deletions roaring.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func (rb *Bitmap) ToBytes() ([]byte, error) {
func (rb *Bitmap) Checksum() uint64 {
const (
offset = 14695981039346656037
prime = 1099511628211
prime = 1099511628211
)

var bytes []byte
Expand Down Expand Up @@ -106,6 +106,14 @@ func (rb *Bitmap) Checksum() uint64 {
return hash
}

// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
stream := internal.NewByteBuffer(data)
return rb.ReadFrom(stream)
}

// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other RoaringBitmap
// implementations (Java, C) and is documented here:
Expand All @@ -114,12 +122,18 @@ func (rb *Bitmap) Checksum() uint64 {
// So add cookieHeader to accept the 4-byte data that has been read in roaring64.ReadFrom.
// It is not necessary to pass cookieHeader when call roaring.ReadFrom to read the roaring32 data directly.
func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err error) {
stream := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
stream.Reset(reader)
stream, ok := reader.(internal.ByteInput)
if !ok {
byteInputAdapter := internal.ByteInputAdapterPool.Get().(*internal.ByteInputAdapter)
byteInputAdapter.Reset(reader)
stream = byteInputAdapter
}

p, err = rb.highlowcontainer.readFrom(stream, cookieHeader...)
internal.ByteInputAdapterPool.Put(stream)

if !ok {
internal.ByteInputAdapterPool.Put(stream.(*internal.ByteInputAdapter))
}
return
}

Expand All @@ -144,7 +158,6 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
//
func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) {
stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer)
stream.Reset(buf)
Expand Down Expand Up @@ -276,9 +289,9 @@ type intIterator struct {
// This way, instead of making up-to 64k allocations per full iteration
// we get a single allocation and simply reinitialize the appropriate
// iterator and point to it in the generic `iter` member on each key bound.
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerShortIterator
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerShortIterator
}

// HasNext returns true if there are more integers to iterate over
Expand Down Expand Up @@ -341,7 +354,6 @@ func (ii *intIterator) AdvanceIfNeeded(minval uint32) {
// IntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type IntIterator = intIterator


// Initialize configures the existing iterator so that it can iterate through the values of
// the provided bitmap.
// The iteration results are undefined if the bitmap is modified (e.g., with Add or Remove).
Expand All @@ -357,9 +369,9 @@ type intReverseIterator struct {
iter shortIterable
highlowcontainer *roaringArray

shortIter reverseIterator
runIter runReverseIterator16
bitmapIter reverseBitmapContainerShortIterator
shortIter reverseIterator
runIter runReverseIterator16
bitmapIter reverseBitmapContainerShortIterator
}

// HasNext returns true if there are more integers to iterate over
Expand Down Expand Up @@ -434,9 +446,9 @@ type manyIntIterator struct {
iter manyIterable
highlowcontainer *roaringArray

shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerManyIterator
shortIter shortIterator
runIter runIterator16
bitmapIter bitmapContainerManyIterator
}

func (ii *manyIntIterator) init() {
Expand Down Expand Up @@ -495,7 +507,6 @@ func (ii *manyIntIterator) NextMany64(hs64 uint64, buf []uint64) int {
return n
}


// ManyIntIterator is meant to allow you to iterate through the values of a bitmap, see Initialize(a *Bitmap)
type ManyIntIterator = manyIntIterator

Expand Down Expand Up @@ -569,7 +580,7 @@ func (rb *Bitmap) Iterate(cb func(x uint32) bool) {
// Iterator creates a new IntPeekable to iterate over the integers contained in the bitmap, in sorted order;
// the iterator becomes invalid if the bitmap is modified (e.g., with Add or Remove).
func (rb *Bitmap) Iterator() IntPeekable {
p := new(intIterator)
p := new(intIterator)
p.Initialize(rb)
return p
}
Expand Down
115 changes: 104 additions & 11 deletions roaring64/roaring64.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strconv"

"github.com/RoaringBitmap/roaring"
"github.com/RoaringBitmap/roaring/internal"
)

const serialCookieNoRunContainer = 12346 // only arrays and bitmaps
Expand Down Expand Up @@ -61,7 +62,7 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
}
n += int64(written)
pos := 0
keyBuf := make([]byte, 4)
keyBuf := buf[:4]
for pos < rb.highlowcontainer.size() {
c := rb.highlowcontainer.getContainerAtIndex(pos)
binary.LittleEndian.PutUint32(keyBuf, rb.highlowcontainer.getKeyAtIndex(pos))
Expand All @@ -80,6 +81,86 @@ func (rb *Bitmap) WriteTo(stream io.Writer) (int64, error) {
return n, nil
}

// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
func (rb *Bitmap) FromUnsafeBytes(data []byte) (p int64, err error) {
stream := internal.NewByteBuffer(data)

cookie, r32, p, err := tryReadFromRoaring32ByteBuffer(rb, stream)
if err != nil {
return p, err
} else if r32 {
return p, nil
}

var sizeBuf [8]byte // Avoid changing the original byte slice.
sizeBuf2, err := stream.Next(4)
if err != nil {
return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read number of containers: %w", err)
}
p += 4
copy(sizeBuf[:], cookie)
copy(sizeBuf[4:], sizeBuf2)
size := binary.LittleEndian.Uint64(sizeBuf[:])

rb.highlowcontainer.resize(0)
if cap(rb.highlowcontainer.keys) >= int(size) {
rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
} else {
rb.highlowcontainer.keys = make([]uint32, size)
}
if cap(rb.highlowcontainer.containers) >= int(size) {
rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
} else {
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
}
if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
} else {
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
}
for i := uint64(0); i < size; i++ {
keyBuf, err := stream.Next(4)
if err != nil {
return 0, fmt.Errorf("error in bitmap.UnsafeFromBytes: could not read key #%d: %w", i, err)
}
p += 4
rb.highlowcontainer.keys[i] = binary.LittleEndian.Uint32(keyBuf)
rb.highlowcontainer.containers[i] = roaring.NewBitmap()
n, err := rb.highlowcontainer.containers[i].ReadFrom(stream)
if n == 0 || err != nil {
return int64(n), fmt.Errorf("Could not deserialize bitmap for key #%d: %s", i, err)
}
p += int64(n)
}

return p, nil
}

func tryReadFromRoaring32ByteBuffer(rb *Bitmap, stream *internal.ByteBuffer) (cookie []byte, r32 bool, p int64, err error) {
// Verify the first two bytes are a valid MagicNumber.
cookie, err = stream.Next(4)
if err != nil {
return cookie, false, 0, err
}
fileMagic := int(binary.LittleEndian.Uint16(cookie[0:2]))
if fileMagic == serialCookieNoRunContainer || fileMagic == serialCookie {
bm32 := roaring.NewBitmap()
p, err = bm32.ReadFrom(stream, cookie...)
if err != nil {
return
}
// Try reuse the underlying slices.
rb.highlowcontainer.resize(0)
rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0)
rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32)
rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false)
return cookie, true, p, nil
}
return
}

// ReadFrom reads a serialized version of this bitmap from stream.
// The format is compatible with other 64-bit RoaringBitmap
// implementations (Java, Go, C++) and it has a specification :
Expand All @@ -103,11 +184,23 @@ func (rb *Bitmap) ReadFrom(stream io.Reader) (p int64, err error) {
sizeBuf = append(cookie, sizeBuf...)

size := binary.LittleEndian.Uint64(sizeBuf)
rb.highlowcontainer = roaringArray64{}
rb.highlowcontainer.keys = make([]uint32, size)
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
keyBuf := make([]byte, 4)
rb.highlowcontainer.resize(0)
if cap(rb.highlowcontainer.keys) >= int(size) {
rb.highlowcontainer.keys = rb.highlowcontainer.keys[:size]
} else {
rb.highlowcontainer.keys = make([]uint32, size)
}
if cap(rb.highlowcontainer.containers) >= int(size) {
rb.highlowcontainer.containers = rb.highlowcontainer.containers[:size]
} else {
rb.highlowcontainer.containers = make([]*roaring.Bitmap, size)
}
if cap(rb.highlowcontainer.needCopyOnWrite) >= int(size) {
rb.highlowcontainer.needCopyOnWrite = rb.highlowcontainer.needCopyOnWrite[:size]
} else {
rb.highlowcontainer.needCopyOnWrite = make([]bool, size)
}
keyBuf := sizeBuf[:4]
for i := uint64(0); i < size; i++ {
n, err = stream.Read(keyBuf)
if n == 0 || err != nil {
Expand Down Expand Up @@ -140,11 +233,11 @@ func tryReadFromRoaring32(rb *Bitmap, stream io.Reader) (cookie []byte, r32 bool
if err != nil {
return
}
rb.highlowcontainer = roaringArray64{
keys: []uint32{0},
containers: []*roaring.Bitmap{bm32},
needCopyOnWrite: []bool{false},
}
// Try reuse the underlying slices.
rb.highlowcontainer.resize(0)
rb.highlowcontainer.keys = append(rb.highlowcontainer.keys, 0)
rb.highlowcontainer.containers = append(rb.highlowcontainer.containers, bm32)
rb.highlowcontainer.needCopyOnWrite = append(rb.highlowcontainer.needCopyOnWrite, false)
return cookie, true, p, nil
}
return
Expand Down
2 changes: 2 additions & 0 deletions roaring64/roaringarray64.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func (ra *roaringArray64) removeIndexRange(begin, end int) {

func (ra *roaringArray64) resize(newsize int) {
for k := newsize; k < len(ra.containers); k++ {
ra.keys[k] = 0
ra.needCopyOnWrite[k] = false
ra.containers[k] = nil
}

Expand Down
Loading

0 comments on commit 952b765

Please sign in to comment.