@@ -19,7 +19,7 @@ use crate::unicode;
19
19
//
20
20
// Some of the implementation complexity here is a result of me wanting to
21
21
// preserve the sequential representation without using additional memory.
22
- // In some cases, we do use linear extra memory, but it is at most 2x and it
22
+ // In many cases, we do use linear extra memory, but it is at most 2x and it
23
23
// is amortized. If we relaxed the memory requirements, this implementation
24
24
// could become much simpler. The extra memory is honestly probably OK, but
25
25
// character classes (especially of the Unicode variety) can become quite
@@ -81,45 +81,14 @@ impl<I: Interval> IntervalSet<I> {
81
81
82
82
/// Add a new interval to this set.
83
83
pub fn push ( & mut self , interval : I ) {
84
+ // TODO: This could be faster. e.g., Push the interval such that
85
+ // it preserves canonicalization.
86
+ self . ranges . push ( interval) ;
87
+ self . canonicalize ( ) ;
84
88
// We don't know whether the new interval added here is considered
85
89
// case folded, so we conservatively assume that the entire set is
86
90
// no longer case folded if it was previously.
87
91
self . folded = false ;
88
-
89
- if self . ranges . is_empty ( ) {
90
- self . ranges . push ( interval) ;
91
- return ;
92
- }
93
-
94
- // Find the first range that is not greater than the new interval.
95
- // This is the first range that could possibly be unioned with the
96
- // new interval.
97
- let mut drain_end = self . ranges . len ( ) ;
98
- while drain_end > 0
99
- && self . ranges [ drain_end - 1 ] . lower ( ) > interval. upper ( )
100
- && !self . ranges [ drain_end - 1 ] . is_contiguous ( & interval)
101
- {
102
- drain_end -= 1 ;
103
- }
104
-
105
- // Try to union the new interval with old intervals backwards.
106
- if drain_end > 0 && self . ranges [ drain_end - 1 ] . is_contiguous ( & interval)
107
- {
108
- self . ranges [ drain_end - 1 ] =
109
- self . ranges [ drain_end - 1 ] . union ( & interval) . unwrap ( ) ;
110
- for i in ( 0 ..drain_end - 1 ) . rev ( ) {
111
- if let Some ( union) =
112
- self . ranges [ drain_end - 1 ] . union ( & self . ranges [ i] )
113
- {
114
- self . ranges [ drain_end - 1 ] = union;
115
- } else {
116
- self . ranges . drain ( i + 1 ..drain_end - 1 ) ;
117
- break ;
118
- }
119
- }
120
- } else {
121
- self . ranges . insert ( drain_end, interval) ;
122
- }
123
92
}
124
93
125
94
/// Return an iterator over all intervals in this set.
@@ -223,13 +192,34 @@ impl<I: Interval> IntervalSet<I> {
223
192
// Folks seem to suggest interval or segment trees, but I'd like to
224
193
// avoid the overhead (both runtime and conceptual) of that.
225
194
//
195
+ // The following is basically my Shitty First Draft. Therefore, in
196
+ // order to grok it, you probably need to read each line carefully.
197
+ // Simplifications are most welcome!
198
+ //
226
199
// Remember, we can assume the canonical format invariant here, which
227
200
// says that all ranges are sorted, not overlapping and not adjacent in
228
201
// each class.
229
202
let drain_end = self . ranges . len ( ) ;
203
+ let ( mut a, mut b) = ( 0 , 0 ) ;
204
+ ' LOOP : while a < drain_end && b < other. ranges . len ( ) {
205
+ // Basically, the easy cases are when neither range overlaps with
206
+ // each other. If the `b` range is less than our current `a`
207
+ // range, then we can skip it and move on.
208
+ if other. ranges [ b] . upper ( ) < self . ranges [ a] . lower ( ) {
209
+ b += 1 ;
210
+ continue ;
211
+ }
212
+ // ... similarly for the `a` range. If it's less than the smallest
213
+ // `b` range, then we can add it as-is.
214
+ if self . ranges [ a] . upper ( ) < other. ranges [ b] . lower ( ) {
215
+ let range = self . ranges [ a] ;
216
+ self . ranges . push ( range) ;
217
+ a += 1 ;
218
+ continue ;
219
+ }
220
+ // Otherwise, we have overlapping ranges.
221
+ assert ! ( !self . ranges[ a] . is_intersection_empty( & other. ranges[ b] ) ) ;
230
222
231
- let mut b = 0 ;
232
- for a in 0 ..drain_end {
233
223
// This part is tricky and was non-obvious to me without looking
234
224
// at explicit examples (see the tests). The trickiness stems from
235
225
// two things: 1) subtracting a range from another range could
@@ -241,34 +231,47 @@ impl<I: Interval> IntervalSet<I> {
241
231
// For example, if our `a` range is `a-t` and our next four `b`
242
232
// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
243
233
// subtraction three times before moving on to the next `a` range.
244
- self . ranges . push ( self . ranges [ a] ) ;
245
- // Only when `b` is not above `a`, `b` might apply to current
246
- // `a` range.
234
+ let mut range = self . ranges [ a] ;
247
235
while b < other. ranges . len ( )
248
- && other. ranges [ b] . lower ( ) <= self . ranges [ a ] . upper ( )
236
+ && !range . is_intersection_empty ( & other. ranges [ b] )
249
237
{
250
- match self . ranges . pop ( ) . unwrap ( ) . difference ( & other. ranges [ b] ) {
251
- ( Some ( range1) , None ) | ( None , Some ( range1) ) => {
252
- self . ranges . push ( range1) ;
238
+ let old_range = range;
239
+ range = match range. difference ( & other. ranges [ b] ) {
240
+ ( None , None ) => {
241
+ // We lost the entire range, so move on to the next
242
+ // without adding this one.
243
+ a += 1 ;
244
+ continue ' LOOP ;
253
245
}
246
+ ( Some ( range1) , None ) | ( None , Some ( range1) ) => range1,
254
247
( Some ( range1) , Some ( range2) ) => {
255
248
self . ranges . push ( range1) ;
256
- self . ranges . push ( range2) ;
249
+ range2
257
250
}
258
- ( None , None ) => { }
251
+ } ;
252
+ // It's possible that the `b` range has more to contribute
253
+ // here. In particular, if it is greater than the original
254
+ // range, then it might impact the next `a` range *and* it
255
+ // has impacted the current `a` range as much as possible,
256
+ // so we can quit. We don't bump `b` so that the next `a`
257
+ // range can apply it.
258
+ if other. ranges [ b] . upper ( ) > old_range. upper ( ) {
259
+ break ;
259
260
}
260
- // The next `b` range might apply to the current
261
+ // Otherwise, the next `b` range might apply to the current
261
262
// `a` range.
262
263
b += 1 ;
263
264
}
264
- // It's possible that the last `b` range has more to
265
- // contribute to the next `a`. We don't bump the last
266
- // `b` so that the next `a` range can apply it.
267
- b = b. saturating_sub ( 1 ) ;
265
+ self . ranges . push ( range) ;
266
+ a += 1 ;
267
+ }
268
+ while a < drain_end {
269
+ let range = self . ranges [ a] ;
270
+ self . ranges . push ( range) ;
271
+ a += 1 ;
268
272
}
269
-
270
273
self . ranges . drain ( ..drain_end) ;
271
- self . folded = self . ranges . is_empty ( ) || ( self . folded && other. folded ) ;
274
+ self . folded = self . folded && other. folded ;
272
275
}
273
276
274
277
/// Compute the symmetric difference of the two sets, in place.
@@ -279,83 +282,11 @@ impl<I: Interval> IntervalSet<I> {
279
282
/// set. That is, the set will contain all elements in either set,
280
283
/// but will not contain any elements that are in both sets.
281
284
pub fn symmetric_difference ( & mut self , other : & IntervalSet < I > ) {
282
- if self . ranges . is_empty ( ) {
283
- self . ranges . extend ( & other. ranges ) ;
284
- self . folded = other. folded ;
285
- return ;
286
- }
287
- if other. ranges . is_empty ( ) {
288
- return ;
289
- }
290
-
291
- // There should be a way to do this in-place with constant memory,
292
- // but I couldn't figure out a simple way to do it. So just append
293
- // the symmetric difference to the end of this range, and then drain
294
- // it before we're done.
295
- let drain_end = self . ranges . len ( ) ;
296
- let mut b = 0 ;
297
- let mut b_range = Some ( other. ranges [ b] ) ;
298
- for a in 0 ..drain_end {
299
- self . ranges . push ( self . ranges [ a] ) ;
300
- while b_range
301
- . map_or ( false , |r| r. lower ( ) <= self . ranges [ a] . upper ( ) )
302
- {
303
- let ( range1, range2) = match self
304
- . ranges
305
- . pop ( )
306
- . unwrap ( )
307
- . symmetric_difference ( & b_range. as_ref ( ) . unwrap ( ) )
308
- {
309
- ( Some ( range1) , None ) | ( None , Some ( range1) ) => {
310
- ( Some ( range1) , None )
311
- }
312
- ( Some ( range1) , Some ( range2) ) => {
313
- ( Some ( range1) , Some ( range2) )
314
- }
315
- ( None , None ) => ( None , None ) ,
316
- } ;
317
- if let Some ( range) = range1 {
318
- if self . ranges . len ( ) > drain_end
319
- && self . ranges . last ( ) . unwrap ( ) . is_contiguous ( & range)
320
- {
321
- self . ranges
322
- . last_mut ( )
323
- . map ( |last| * last = last. union ( & range) . unwrap ( ) ) ;
324
- } else {
325
- self . ranges . push ( range) ;
326
- }
327
- }
328
- if let Some ( range) = range2 {
329
- self . ranges . push ( range) ;
330
- }
331
-
332
- b_range = if self . ranges . len ( ) > drain_end
333
- && self . ranges . last ( ) . unwrap ( ) . upper ( )
334
- > self . ranges [ a] . upper ( )
335
- {
336
- Some ( * self . ranges . last ( ) . unwrap ( ) )
337
- } else {
338
- b += 1 ;
339
- other. ranges . get ( b) . cloned ( )
340
- } ;
341
- }
342
- }
343
- while let Some ( range) = b_range {
344
- if self . ranges . len ( ) > drain_end
345
- && self . ranges . last ( ) . unwrap ( ) . is_contiguous ( & range)
346
- {
347
- self . ranges
348
- . last_mut ( )
349
- . map ( |last| * last = last. union ( & range) . unwrap ( ) ) ;
350
- } else {
351
- self . ranges . push ( range) ;
352
- }
353
- b += 1 ;
354
- b_range = other. ranges . get ( b) . cloned ( ) ;
355
- }
356
-
357
- self . ranges . drain ( ..drain_end) ;
358
- self . folded = self . ranges . is_empty ( ) || ( self . folded && other. folded ) ;
285
+ // TODO(burntsushi): Fix this so that it amortizes allocation.
286
+ let mut intersection = self . clone ( ) ;
287
+ intersection. intersect ( other) ;
288
+ self . union ( other) ;
289
+ self . difference ( & intersection) ;
359
290
}
360
291
361
292
/// Negate this interval set.
@@ -371,44 +302,28 @@ impl<I: Interval> IntervalSet<I> {
371
302
return ;
372
303
}
373
304
305
+ // There should be a way to do this in-place with constant memory,
306
+ // but I couldn't figure out a simple way to do it. So just append
307
+ // the negation to the end of this range, and then drain it before
308
+ // we're done.
309
+ let drain_end = self . ranges . len ( ) ;
310
+
374
311
// We do checked arithmetic below because of the canonical ordering
375
312
// invariant.
376
313
if self . ranges [ 0 ] . lower ( ) > I :: Bound :: min_value ( ) {
377
- let mut pre_upper = self . ranges [ 0 ] . upper ( ) ;
378
- self . ranges [ 0 ] = I :: create (
379
- I :: Bound :: min_value ( ) ,
380
- self . ranges [ 0 ] . lower ( ) . decrement ( ) ,
381
- ) ;
382
- for i in 1 ..self . ranges . len ( ) {
383
- let lower = pre_upper. increment ( ) ;
384
- pre_upper = self . ranges [ i] . upper ( ) ;
385
- self . ranges [ i] =
386
- I :: create ( lower, self . ranges [ i] . lower ( ) . decrement ( ) ) ;
387
- }
388
- if pre_upper < I :: Bound :: max_value ( ) {
389
- self . ranges . push ( I :: create (
390
- pre_upper. increment ( ) ,
391
- I :: Bound :: max_value ( ) ,
392
- ) ) ;
393
- }
394
- } else {
395
- for i in 1 ..self . ranges . len ( ) {
396
- self . ranges [ i - 1 ] = I :: create (
397
- self . ranges [ i - 1 ] . upper ( ) . increment ( ) ,
398
- self . ranges [ i] . lower ( ) . decrement ( ) ,
399
- ) ;
400
- }
401
- if self . ranges . last ( ) . unwrap ( ) . upper ( ) < I :: Bound :: max_value ( ) {
402
- self . ranges . last_mut ( ) . map ( |range| {
403
- * range = I :: create (
404
- range. upper ( ) . increment ( ) ,
405
- I :: Bound :: max_value ( ) ,
406
- )
407
- } ) ;
408
- } else {
409
- self . ranges . pop ( ) ;
410
- }
314
+ let upper = self . ranges [ 0 ] . lower ( ) . decrement ( ) ;
315
+ self . ranges . push ( I :: create ( I :: Bound :: min_value ( ) , upper) ) ;
316
+ }
317
+ for i in 1 ..drain_end {
318
+ let lower = self . ranges [ i - 1 ] . upper ( ) . increment ( ) ;
319
+ let upper = self . ranges [ i] . lower ( ) . decrement ( ) ;
320
+ self . ranges . push ( I :: create ( lower, upper) ) ;
321
+ }
322
+ if self . ranges [ drain_end - 1 ] . upper ( ) < I :: Bound :: max_value ( ) {
323
+ let lower = self . ranges [ drain_end - 1 ] . upper ( ) . increment ( ) ;
324
+ self . ranges . push ( I :: create ( lower, I :: Bound :: max_value ( ) ) ) ;
411
325
}
326
+ self . ranges . drain ( ..drain_end) ;
412
327
// We don't need to update whether this set is folded or not, because
413
328
// it is conservatively preserved through negation. Namely, if a set
414
329
// is not folded, then it is possible that its negation is folded, for
@@ -422,7 +337,6 @@ impl<I: Interval> IntervalSet<I> {
422
337
// of case folded characters. Negating it in turn means that all
423
338
// equivalence classes in the set are negated, and any equivalence
424
339
// class that was previously not in the set is now entirely in the set.
425
- self . folded = self . ranges . is_empty ( ) || self . folded ;
426
340
}
427
341
428
342
/// Converts this set into a canonical ordering.
@@ -433,20 +347,24 @@ impl<I: Interval> IntervalSet<I> {
433
347
self . ranges . sort ( ) ;
434
348
assert ! ( !self . ranges. is_empty( ) ) ;
435
349
436
- // We maintain the canonicalization results in-place at `0..newi`.
437
- // `newi` will keep track of the end of the canonicalized ranges.
438
- let mut newi = 0 ;
439
- for oldi in 1 ..self . ranges . len ( ) {
440
- // The last new range gets merged with the current old range when
441
- // unionable. If not, we update `newi` and store it as a new range.
442
- if let Some ( union) = self . ranges [ newi] . union ( & self . ranges [ oldi] ) {
443
- self . ranges [ newi] = union;
444
- } else {
445
- newi += 1 ;
446
- self . ranges [ newi] = self . ranges [ oldi] ;
350
+ // Is there a way to do this in-place with constant memory? I couldn't
351
+ // figure out a way to do it. So just append the canonicalization to
352
+ // the end of this range, and then drain it before we're done.
353
+ let drain_end = self . ranges . len ( ) ;
354
+ for oldi in 0 ..drain_end {
355
+ // If we've added at least one new range, then check if we can
356
+ // merge this range in the previously added range.
357
+ if self . ranges . len ( ) > drain_end {
358
+ let ( last, rest) = self . ranges . split_last_mut ( ) . unwrap ( ) ;
359
+ if let Some ( union) = last. union ( & rest[ oldi] ) {
360
+ * last = union;
361
+ continue ;
362
+ }
447
363
}
364
+ let range = self . ranges [ oldi] ;
365
+ self . ranges . push ( range) ;
448
366
}
449
- self . ranges . truncate ( newi + 1 ) ;
367
+ self . ranges . drain ( ..drain_end ) ;
450
368
}
451
369
452
370
/// Returns true if and only if this class is in a canonical ordering.
@@ -568,13 +486,7 @@ pub trait Interval:
568
486
other : & Self ,
569
487
) -> ( Option < Self > , Option < Self > ) {
570
488
let union = match self . union ( other) {
571
- None => {
572
- return if self . upper ( ) < other. lower ( ) {
573
- ( Some ( self . clone ( ) ) , Some ( other. clone ( ) ) )
574
- } else {
575
- ( Some ( other. clone ( ) ) , Some ( self . clone ( ) ) )
576
- }
577
- }
489
+ None => return ( Some ( self . clone ( ) ) , Some ( other. clone ( ) ) ) ,
578
490
Some ( union) => union,
579
491
} ;
580
492
let intersection = match self . intersect ( other) {
0 commit comments