From bb268bd9a9b03edd0d17adfa5c1e917f2c34afc7 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 14 Apr 2021 17:43:24 -0400 Subject: [PATCH] Trimming the run containers so that they use less memory and are safer. --- roaring_test.go | 2 +- roaringcow_test.go | 2 +- runcontainer.go | 98 ++++++++++++++++++++------------------------ runcontainer_test.go | 75 --------------------------------- 4 files changed, 46 insertions(+), 131 deletions(-) diff --git a/roaring_test.go b/roaring_test.go index 5042e1c8..4481c1e0 100644 --- a/roaring_test.go +++ b/roaring_test.go @@ -2091,7 +2091,7 @@ func TestStats(t *testing.T) { if intSize == 4 { runContainerBytes = 40 } else { - runContainerBytes = 52 + runContainerBytes = 36 } expectedStats := Statistics{ diff --git a/roaringcow_test.go b/roaringcow_test.go index 7a937944..dec622d3 100644 --- a/roaringcow_test.go +++ b/roaringcow_test.go @@ -1871,7 +1871,7 @@ func TestStatsCOW(t *testing.T) { if intSize == 4 { runContainerBytes = 40 } else { - runContainerBytes = 52 + runContainerBytes = 36 } expectedStats := Statistics{ diff --git a/runcontainer.go b/runcontainer.go index 2e799c7e..97c00365 100644 --- a/runcontainer.go +++ b/runcontainer.go @@ -49,9 +49,6 @@ import ( type runContainer16 struct { iv []interval16 card int64 - - // avoid allocation during search - myOpts searchOptions `msg:"-"` } // interval16 is the internal to runContainer16 @@ -613,10 +610,7 @@ func (rc *runContainer16) unionCardinality(b *runContainer16) uint64 { // indexOfIntervalAtOrAfter is a helper for union. func (rc *runContainer16) indexOfIntervalAtOrAfter(key int64, startIndex int64) int64 { - rc.myOpts.startIndex = startIndex - rc.myOpts.endxIndex = 0 - - w, already, _ := rc.search(key, &rc.myOpts) + w, already, _ := rc.searchRange(key, startIndex, 0) if already { return w } @@ -840,7 +834,7 @@ toploop: // get returns true iff key is in the container. func (rc *runContainer16) contains(key uint16) bool { - _, in, _ := rc.search(int64(key), nil) + _, in, _ := rc.search(int64(key)) return in } @@ -849,22 +843,7 @@ func (rc *runContainer16) numIntervals() int { return len(rc.iv) } -// searchOptions allows us to accelerate search with -// prior knowledge of (mostly lower) bounds. This is used by Union -// and Intersect. -type searchOptions struct { - // start here instead of at 0 - startIndex int64 - - // upper bound instead of len(rc.iv); - // endxIndex == 0 means ignore the bound and use - // endxIndex == n ==len(rc.iv) which is also - // naturally the default for search() - // when opt = nil. - endxIndex int64 -} - -// search returns alreadyPresent to indicate if the +// searchRange returns alreadyPresent to indicate if the // key is already in one of our interval16s. // // If key is alreadyPresent, then whichInterval16 tells @@ -888,24 +867,16 @@ type searchOptions struct { // // runContainer16.search always returns whichInterval16 < len(rc.iv). // -// If not nil, opts can be used to further restrict -// the search space. +// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there +// no upper bound. // -func (rc *runContainer16) search(key int64, opts *searchOptions) (whichInterval16 int64, alreadyPresent bool, numCompares int) { +func (rc *runContainer16) searchRange(key int64, startIndex int64, endxIndex int64) (whichInterval16 int64, alreadyPresent bool, numCompares int) { n := int64(len(rc.iv)) if n == 0 { return -1, false, 0 } - - startIndex := int64(0) - endxIndex := n - if opts != nil { - startIndex = opts.startIndex - - // let endxIndex == 0 mean no effect - if opts.endxIndex > 0 { - endxIndex = opts.endxIndex - } + if endxIndex == 0 { + endxIndex = n } // sort.Search returns the smallest index i @@ -975,6 +946,34 @@ func (rc *runContainer16) search(key int64, opts *searchOptions) (whichInterval1 return } +// search returns alreadyPresent to indicate if the +// key is already in one of our interval16s. +// +// If key is alreadyPresent, then whichInterval16 tells +// you where. +// +// If key is not already present, then whichInterval16 is +// set as follows: +// +// a) whichInterval16 == len(rc.iv)-1 if key is beyond our +// last interval16 in rc.iv; +// +// b) whichInterval16 == -1 if key is before our first +// interval16 in rc.iv; +// +// c) whichInterval16 is set to the minimum index of rc.iv +// which comes strictly before the key; +// so rc.iv[whichInterval16].last < key, +// and if whichInterval16+1 exists, then key < rc.iv[whichInterval16+1].start +// (Note that whichInterval16+1 won't exist when +// whichInterval16 is the last interval.) +// +// runContainer16.search always returns whichInterval16 < len(rc.iv). +// +func (rc *runContainer16) search(key int64) (whichInterval16 int64, alreadyPresent bool, numCompares int) { + return rc.searchRange(key, 0, 0) +} + // cardinality returns the count of the integers stored in the // runContainer16. func (rc *runContainer16) cardinality() int64 { @@ -1068,7 +1067,7 @@ func (rc *runContainer16) Add(k uint16) (wasNew bool) { k64 := int64(k) - index, present, _ := rc.search(k64, nil) + index, present, _ := rc.search(k64) if present { return // already there } @@ -1201,13 +1200,8 @@ func (ri *runIterator16) advanceIfNeeded(minval uint16) { return } - opt := &searchOptions{ - startIndex: ri.curIndex, - endxIndex: int64(len(ri.rc.iv)), - } - // interval cannot be -1 because of minval > peekNext - interval, isPresent, _ := ri.rc.search(int64(minval), opt) + interval, isPresent, _ := ri.rc.searchRange(int64(minval), ri.curIndex, int64(len(ri.rc.iv))) // if the minval is present, set the curPosIndex at the right position if isPresent { @@ -1360,7 +1354,7 @@ func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int { func (rc *runContainer16) removeKey(key uint16) (wasPresent bool) { var index int64 - index, wasPresent, _ = rc.search(int64(key), nil) + index, wasPresent, _ = rc.search(int64(key)) if !wasPresent { return // already removed, nothing to do. } @@ -1451,12 +1445,8 @@ func intersectWithLeftover16(astart, alast, bstart, blast int64) (isOverlap, isL return } -func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex int64, key int64) (index int64, done bool) { - - rc.myOpts.startIndex = startIndex - rc.myOpts.endxIndex = 0 - - w, _, _ := rc.search(key, &rc.myOpts) +func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex int64, key int64) (index int64, done bool) { + w, _, _ := rc.searchRange(key, startIndex, 0) // rc.search always returns w < len(rc.iv) if w < startIndex { // not found and comes before lower bound startIndex, @@ -1597,8 +1587,8 @@ func (rc *runContainer16) isubtract(del interval16) { } // INVAR there is some intersection between rc and del - istart, startAlready, _ := rc.search(int64(del.start), nil) - ilast, lastAlready, _ := rc.search(int64(del.last()), nil) + istart, startAlready, _ := rc.search(int64(del.start)) + ilast, lastAlready, _ := rc.search(int64(del.last())) rc.card = -1 if istart == -1 { if ilast == n-1 && !lastAlready { @@ -2350,7 +2340,7 @@ func (rc *runContainer16) getCardinality() int { func (rc *runContainer16) rank(x uint16) int { n := int64(len(rc.iv)) xx := int64(x) - w, already, _ := rc.search(xx, nil) + w, already, _ := rc.search(xx) if w < 0 { return 0 } diff --git a/runcontainer_test.go b/runcontainer_test.go index f0d4d722..eef09ca2 100644 --- a/runcontainer_test.go +++ b/runcontainer_test.go @@ -366,81 +366,6 @@ func TestRleRunReverseIterator16(t *testing.T) { }) } -func TestRleRunSearch16(t *testing.T) { - - t.Run("RunContainer16.search should respect the prior bounds we provide for efficiency of searching through a subset of the intervals", func(t *testing.T) { - { - vals := []uint16{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, MaxUint16 - 3, MaxUint16} - exAt := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11} // expected at - absent := []uint16{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, MaxUint16 - 2} - - rc := newRunContainer16FromVals(true, vals...) - - assert.EqualValues(t, 12, rc.cardinality()) - - var where int64 - var present bool - - for i, v := range vals { - where, present, _ = rc.search(int64(v), nil) - assert.True(t, present) - assert.EqualValues(t, exAt[i], where) - } - - for i, v := range absent { - where, present, _ = rc.search(int64(v), nil) - assert.False(t, present) - assert.EqualValues(t, i, where) - } - - // delete the MaxUint16 so we can test - // the behavior when searching near upper limit. - - assert.EqualValues(t, 12, rc.cardinality()) - assert.Equal(t, 12, rc.numIntervals()) - - rc.removeKey(MaxUint16) - - assert.EqualValues(t, 11, rc.cardinality()) - assert.Equal(t, 11, rc.numIntervals()) - - where, present, _ = rc.search(MaxUint16, nil) - - assert.False(t, present) - assert.EqualValues(t, 10, where) - - var numCompares int - where, present, numCompares = rc.search(MaxUint16, nil) - - assert.False(t, present) - assert.EqualValues(t, 10, where) - assert.EqualValues(t, 3, numCompares) - - opts := &searchOptions{ - startIndex: 5, - } - - where, present, numCompares = rc.search(MaxUint16, opts) - - assert.False(t, present) - assert.EqualValues(t, 10, where) - assert.EqualValues(t, 2, numCompares) - - where, present, _ = rc.search(MaxUint16-3, opts) - - assert.True(t, present) - assert.EqualValues(t, 10, where) - - // with the bound in place, MaxUint16-3 should not be found - opts.endxIndex = 10 - where, present, _ = rc.search(MaxUint16-3, opts) - - assert.False(t, present) - assert.EqualValues(t, 9, where) - } - }) -} - func TestRleIntersection16(t *testing.T) { t.Run("RunContainer16.intersect of two RunContainer16(s) should return their intersection", func(t *testing.T) { {