Skip to content

Commit

Permalink
Merge pull request RoaringBitmap#397 from RoaringBitmap/issue396
Browse files Browse the repository at this point in the history
Verifying and fixing issue 396 :  `UnmarshalBinary`  or `FromBase64` should not have their containers marked as needing copy-on-write
  • Loading branch information
lemire authored Aug 25, 2023
2 parents 507997c + 32b6f65 commit c7bccc2
Show file tree
Hide file tree
Showing 11 changed files with 106 additions and 71 deletions.
3 changes: 0 additions & 3 deletions benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (

// BENCHMARKS, to run them type "go test -bench Benchmark -run -"


// go test -bench BenchmarkIteratorAlloc -benchmem -run -
func BenchmarkIteratorAlloc(b *testing.B) {
bm := NewBitmap()
Expand Down Expand Up @@ -84,7 +83,6 @@ func BenchmarkIteratorAlloc(b *testing.B) {
b.Fatalf("Cardinalities don't match: %d, %d", counter, expected_cardinality)
}


b.Run("many iteration with alloc", func(b *testing.B) {
for n := 0; n < b.N; n++ {
counter = 0
Expand Down Expand Up @@ -117,7 +115,6 @@ func BenchmarkIteratorAlloc(b *testing.B) {
}
}


// go test -bench BenchmarkOrs -benchmem -run -
func BenchmarkOrs(b *testing.B) {

Expand Down
1 change: 0 additions & 1 deletion bitmapcontainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -1062,7 +1062,6 @@ func (bc *bitmapContainer) PrevSetBit(i int) int {

// reference the java implementation
// https://github.com/RoaringBitmap/RoaringBitmap/blob/master/src/main/java/org/roaringbitmap/BitmapContainer.java#L875-L892
//
func (bc *bitmapContainer) numberOfRuns() int {
if bc.cardinality == 0 {
return 0
Expand Down
17 changes: 17 additions & 0 deletions internal/byte_input.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ type ByteInput interface {
// Next returns a slice containing the next n bytes from the buffer,
// advancing the buffer as if the bytes had been returned by Read.
Next(n int) ([]byte, error)
// NextReturnsSafeSlice returns true if Next() returns a safe slice as opposed
// to a slice that points to an underlying buffer possibly owned by another system.
// When NextReturnsSafeSlice returns false, the result from Next() should be copied
// before it is modified (i.e., it is immutable).
NextReturnsSafeSlice() bool
// ReadUInt32 reads uint32 with LittleEndian order
ReadUInt32() (uint32, error)
// ReadUInt16 reads uint16 with LittleEndian order
Expand Down Expand Up @@ -76,6 +81,12 @@ func (b *ByteBuffer) Next(n int) ([]byte, error) {
return data, nil
}

// NextReturnsSafeSlice returns false since ByteBuffer might hold
// an array owned by some other systems.
func (b *ByteBuffer) NextReturnsSafeSlice() bool {
return false
}

// ReadUInt32 reads uint32 with LittleEndian order
func (b *ByteBuffer) ReadUInt32() (uint32, error) {
if len(b.buf)-b.off < 4 {
Expand Down Expand Up @@ -157,6 +168,12 @@ func (b *ByteInputAdapter) Next(n int) ([]byte, error) {
return buf, nil
}

// NextReturnsSafeSlice returns true since ByteInputAdapter always returns a slice
// allocated with make([]byte, ...)
func (b *ByteInputAdapter) NextReturnsSafeSlice() bool {
return true
}

// ReadUInt32 reads uint32 with LittleEndian order
func (b *ByteInputAdapter) ReadUInt32() (uint32, error) {
buf := b.buf[:4]
Expand Down
12 changes: 12 additions & 0 deletions roaring.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ func (rb *Bitmap) Checksum() uint64 {
// FromUnsafeBytes reads a serialized version of this bitmap from the byte buffer without copy.
// It is the caller's responsibility to ensure that the input data is not modified and remains valid for the entire lifetime of this bitmap.
// This method avoids small allocations but holds references to the input data buffer. It is GC-friendly, but it may consume more memory eventually.
// The containers in the resulting bitmap are immutable containers tied to the provided byte array and they rely on
// copy-on-write which means that modifying them creates copies. Thus FromUnsafeBytes is more likely to be appropriate for read-only use cases,
// when the resulting bitmap can be considered immutable.
//
// See also the FromBuffer function.
// See https://github.com/RoaringBitmap/roaring/pull/395 for more details.
func (rb *Bitmap) FromUnsafeBytes(data []byte, cookieHeader ...byte) (p int64, err error) {
stream := internal.NewByteBuffer(data)
return rb.ReadFrom(stream)
Expand Down Expand Up @@ -153,11 +159,17 @@ func (rb *Bitmap) ReadFrom(reader io.Reader, cookieHeader ...byte) (p int64, err
// You should *not* change the copy-on-write status of the resulting
// bitmaps (SetCopyOnWrite).
//
// Thus FromBuffer is more likely to be appropriate for read-only use cases,
// when the resulting bitmap can be considered immutable.
//
// If buf becomes unavailable, then a bitmap created with
// FromBuffer would be effectively broken. Furthermore, any
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
//
// See also the FromUnsafeBytes function which can have better performance
// in some cases.
func (rb *Bitmap) FromBuffer(buf []byte) (p int64, err error) {
stream := internal.ByteBufferPool.Get().(*internal.ByteBuffer)
stream.Reset(buf)
Expand Down
6 changes: 3 additions & 3 deletions roaring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ func hashTest(t *testing.T, N uint64) {
rb1, rb2 = NewBitmap(), NewBitmap()
for x := uint64(0); x <= N*gap; x += gap {
// x+3 guarantees runs, gap/2 guarantees some variety
if x + 3 + gap/2 > MaxUint32 {
if x+3+gap/2 > MaxUint32 {
break
}
rb1.AddRange(uint64(x), uint64(x + 3 + gap/2))
rb2.AddRange(uint64(x), uint64(x + 3 + gap/2))
rb1.AddRange(uint64(x), uint64(x+3+gap/2))
rb2.AddRange(uint64(x), uint64(x+3+gap/2))
}

rb1.RunOptimize()
Expand Down
19 changes: 10 additions & 9 deletions roaringarray.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import (
"bytes"
"encoding/binary"
"fmt"
"github.com/RoaringBitmap/roaring/internal"
"io"

"github.com/RoaringBitmap/roaring/internal"
)

type container interface {
Expand Down Expand Up @@ -112,9 +113,10 @@ func newRoaringArray() *roaringArray {
// runOptimize compresses the element containers to minimize space consumed.
// Q: how does this interact with copyOnWrite and needCopyOnWrite?
// A: since we aren't changing the logical content, just the representation,
// we don't bother to check the needCopyOnWrite bits. We replace
// (possibly all) elements of ra.containers in-place with space
// optimized versions.
//
// we don't bother to check the needCopyOnWrite bits. We replace
// (possibly all) elements of ra.containers in-place with space
// optimized versions.
func (ra *roaringArray) runOptimize() {
for i := range ra.containers {
ra.containers[i] = ra.containers[i].toEfficientContainer()
Expand Down Expand Up @@ -465,9 +467,7 @@ func (ra *roaringArray) serializedSizeInBytes() uint64 {
return answer
}

//
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
//
func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
hasRun := ra.hasRunCompression()
isRunSizeInBytes := 0
Expand Down Expand Up @@ -544,15 +544,14 @@ func (ra *roaringArray) writeTo(w io.Writer) (n int64, err error) {
return n, nil
}

//
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
//
func (ra *roaringArray) toBytes() ([]byte, error) {
var buf bytes.Buffer
_, err := ra.writeTo(&buf)
return buf.Bytes(), err
}

// Reads a serialized roaringArray from a byte slice.
func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte) (int64, error) {
var cookie uint32
var err error
Expand All @@ -567,6 +566,8 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
return stream.GetReadBytes(), fmt.Errorf("error in roaringArray.readFrom: could not read initial cookie: %s", err)
}
}
// If NextReturnsSafeSlice is false, then willNeedCopyOnWrite should be true
willNeedCopyOnWrite := !stream.NextReturnsSafeSlice()

var size uint32
var isRunBitmap []byte
Expand Down Expand Up @@ -631,7 +632,7 @@ func (ra *roaringArray) readFrom(stream internal.ByteInput, cookieHeader ...byte
key := keycard[2*i]
card := int(keycard[2*i+1]) + 1
ra.keys[i] = key
ra.needCopyOnWrite[i] = true
ra.needCopyOnWrite[i] = willNeedCopyOnWrite

if isRunBitmap != nil && isRunBitmap[i/8]&(1<<(i%8)) != 0 {
// run container
Expand Down
48 changes: 20 additions & 28 deletions runcontainer.go
Original file line number Diff line number Diff line change
Expand Up @@ -253,10 +253,8 @@ func newRunContainer16FromBitmapContainer(bc *bitmapContainer) *runContainer16 {

}

//
// newRunContainer16FromArray populates a new
// runContainer16 from the contents of arr.
//
func newRunContainer16FromArray(arr *arrayContainer) *runContainer16 {
// keep this in sync with newRunContainer16FromVals above

Expand Down Expand Up @@ -834,24 +832,23 @@ func (rc *runContainer16) numIntervals() int {
// If key is not already present, then whichInterval16 is
// set as follows:
//
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
// last interval16 in rc.iv;
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
// last interval16 in rc.iv;
//
// b) whichInterval16 == -1 if key is before our first
// interval16 in rc.iv;
// b) whichInterval16 == -1 if key is before our first
// interval16 in rc.iv;
//
// c) whichInterval16 is set to the minimum index of rc.iv
// which comes strictly before the key;
// so rc.iv[whichInterval16].last < key,
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
// (Note that whichInterval16+1 won't exist when
// whichInterval16 is the last interval.)
// c) whichInterval16 is set to the minimum index of rc.iv
// which comes strictly before the key;
// so rc.iv[whichInterval16].last < key,
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
// (Note that whichInterval16+1 won't exist when
// whichInterval16 is the last interval.)
//
// runContainer16.search always returns whichInterval16 < len(rc.iv).
//
// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there
// no upper bound.
//
func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
n := int(len(rc.iv))
if n == 0 {
Expand Down Expand Up @@ -937,21 +934,20 @@ func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (w
// If key is not already present, then whichInterval16 is
// set as follows:
//
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
// last interval16 in rc.iv;
// a) whichInterval16 == len(rc.iv)-1 if key is beyond our
// last interval16 in rc.iv;
//
// b) whichInterval16 == -1 if key is before our first
// interval16 in rc.iv;
// b) whichInterval16 == -1 if key is before our first
// interval16 in rc.iv;
//
// c) whichInterval16 is set to the minimum index of rc.iv
// which comes strictly before the key;
// so rc.iv[whichInterval16].last < key,
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
// (Note that whichInterval16+1 won't exist when
// whichInterval16 is the last interval.)
// c) whichInterval16 is set to the minimum index of rc.iv
// which comes strictly before the key;
// so rc.iv[whichInterval16].last < key,
// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start
// (Note that whichInterval16+1 won't exist when
// whichInterval16 is the last interval.)
//
// runContainer16.search always returns whichInterval16 < len(rc.iv).
//
func (rc *runContainer16) search(key int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
return rc.searchRange(key, 0, 0)
}
Expand Down Expand Up @@ -994,7 +990,6 @@ func newRunContainer16() *runContainer16 {

// newRunContainer16CopyIv creates a run container, initializing
// with a copy of the supplied iv slice.
//
func newRunContainer16CopyIv(iv []interval16) *runContainer16 {
rc := &runContainer16{
iv: make([]interval16, len(iv)),
Expand All @@ -1011,7 +1006,6 @@ func (rc *runContainer16) Clone() *runContainer16 {
// newRunContainer16TakeOwnership returns a new runContainer16
// backed by the provided iv slice, which we will
// assume exclusive control over from now on.
//
func newRunContainer16TakeOwnership(iv []interval16) *runContainer16 {
rc := &runContainer16{
iv: iv,
Expand Down Expand Up @@ -2006,7 +2000,6 @@ func (rc *runContainer16) not(firstOfRange, endx int) container {
// Current routine is correct but
// makes 2 more passes through the arrays than should be
// strictly necessary. Measure both ways though--this may not matter.
//
func (rc *runContainer16) Not(firstOfRange, endx int) *runContainer16 {

if firstOfRange > endx {
Expand Down Expand Up @@ -2329,7 +2322,6 @@ func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16,
// the backing array, and then you write
// the answer at the beginning. What this
// trick does is minimize memory allocations.
//
func (rc *runContainer16) lazyIOR(a container) container {
// not lazy at the moment
return rc.ior(a)
Expand Down
1 change: 0 additions & 1 deletion serialization.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

// writeTo for runContainer16 follows this
// spec: https://github.com/RoaringBitmap/RoaringFormatSpec
//
func (b *runContainer16) writeTo(stream io.Writer) (int, error) {
buf := make([]byte, 2+4*len(b.iv))
binary.LittleEndian.PutUint16(buf[0:], uint16(len(b.iv)))
Expand Down
4 changes: 2 additions & 2 deletions serialization_frozen_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ package roaring

import (
"bytes"
"fmt"
"io/ioutil"
"os"
"reflect"
"testing"
"os"
"fmt"
)

func TestFrozenFormat(t *testing.T) {
Expand Down
15 changes: 7 additions & 8 deletions serialization_littleendian.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@ func (bc *bitmapContainer) asLittleEndianByteSlice() []byte {

// Deserialization code follows

////
// //
// These methods (byteSliceAsUint16Slice,...) do not make copies,
// they are pointer-based (unsafe). The caller is responsible to
// ensure that the input slice does not get garbage collected, deleted
// or modified while you hold the returned slince.
////
// //
func byteSliceAsUint16Slice(slice []byte) (result []uint16) { // here we create a new slice holder
if len(slice)%2 != 0 {
panic("Slice size should be divisible by 2")
Expand Down Expand Up @@ -295,7 +295,6 @@ func byteSliceAsBoolSlice(slice []byte) (result []bool) {
// bitmap derived from this bitmap (e.g., via Or, And) might
// also be broken. Thus, before making buf unavailable, you should
// call CloneCopyOnWriteContainers on all such bitmaps.
//
func (rb *Bitmap) FrozenView(buf []byte) error {
return rb.highlowcontainer.frozenView(buf)
}
Expand Down Expand Up @@ -412,11 +411,11 @@ func (ra *roaringArray) frozenView(buf []byte) error {
}

var c container
containersSz := int(unsafe.Sizeof(c))*nCont
bitsetsSz := int(unsafe.Sizeof(bitmapContainer{}))*nBitmap
arraysSz := int(unsafe.Sizeof(arrayContainer{}))*nArray
runsSz := int(unsafe.Sizeof(runContainer16{}))*nRun
needCOWSz := int(unsafe.Sizeof(true))*nCont
containersSz := int(unsafe.Sizeof(c)) * nCont
bitsetsSz := int(unsafe.Sizeof(bitmapContainer{})) * nBitmap
arraysSz := int(unsafe.Sizeof(arrayContainer{})) * nArray
runsSz := int(unsafe.Sizeof(runContainer16{})) * nRun
needCOWSz := int(unsafe.Sizeof(true)) * nCont

bitmapArenaSz := containersSz + bitsetsSz + arraysSz + runsSz + needCOWSz
bitmapArena := make([]byte, bitmapArenaSz)
Expand Down
Loading

0 comments on commit c7bccc2

Please sign in to comment.