Skip to content

Commit f082244

Browse files
committed
syntax: revert interval set optimizations
This reverts commit 6d2b09e. Sadly I just don't have the time to fix this code myself. It's too subtle. So I'm just reverting it entirely for now. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 Ref #1051
1 parent d5144b2 commit f082244

File tree

1 file changed

+97
-185
lines changed

1 file changed

+97
-185
lines changed

regex-syntax/src/hir/interval.rs

+97-185
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::unicode;
1919
//
2020
// Some of the implementation complexity here is a result of me wanting to
2121
// preserve the sequential representation without using additional memory.
22-
// In some cases, we do use linear extra memory, but it is at most 2x and it
22+
// In many cases, we do use linear extra memory, but it is at most 2x and it
2323
// is amortized. If we relaxed the memory requirements, this implementation
2424
// could become much simpler. The extra memory is honestly probably OK, but
2525
// character classes (especially of the Unicode variety) can become quite
@@ -81,45 +81,14 @@ impl<I: Interval> IntervalSet<I> {
8181

8282
/// Add a new interval to this set.
8383
pub fn push(&mut self, interval: I) {
84+
// TODO: This could be faster. e.g., Push the interval such that
85+
// it preserves canonicalization.
86+
self.ranges.push(interval);
87+
self.canonicalize();
8488
// We don't know whether the new interval added here is considered
8589
// case folded, so we conservatively assume that the entire set is
8690
// no longer case folded if it was previously.
8791
self.folded = false;
88-
89-
if self.ranges.is_empty() {
90-
self.ranges.push(interval);
91-
return;
92-
}
93-
94-
// Find the first range that is not greater than the new interval.
95-
// This is the first range that could possibly be unioned with the
96-
// new interval.
97-
let mut drain_end = self.ranges.len();
98-
while drain_end > 0
99-
&& self.ranges[drain_end - 1].lower() > interval.upper()
100-
&& !self.ranges[drain_end - 1].is_contiguous(&interval)
101-
{
102-
drain_end -= 1;
103-
}
104-
105-
// Try to union the new interval with old intervals backwards.
106-
if drain_end > 0 && self.ranges[drain_end - 1].is_contiguous(&interval)
107-
{
108-
self.ranges[drain_end - 1] =
109-
self.ranges[drain_end - 1].union(&interval).unwrap();
110-
for i in (0..drain_end - 1).rev() {
111-
if let Some(union) =
112-
self.ranges[drain_end - 1].union(&self.ranges[i])
113-
{
114-
self.ranges[drain_end - 1] = union;
115-
} else {
116-
self.ranges.drain(i + 1..drain_end - 1);
117-
break;
118-
}
119-
}
120-
} else {
121-
self.ranges.insert(drain_end, interval);
122-
}
12392
}
12493

12594
/// Return an iterator over all intervals in this set.
@@ -223,13 +192,34 @@ impl<I: Interval> IntervalSet<I> {
223192
// Folks seem to suggest interval or segment trees, but I'd like to
224193
// avoid the overhead (both runtime and conceptual) of that.
225194
//
195+
// The following is basically my Shitty First Draft. Therefore, in
196+
// order to grok it, you probably need to read each line carefully.
197+
// Simplifications are most welcome!
198+
//
226199
// Remember, we can assume the canonical format invariant here, which
227200
// says that all ranges are sorted, not overlapping and not adjacent in
228201
// each class.
229202
let drain_end = self.ranges.len();
203+
let (mut a, mut b) = (0, 0);
204+
'LOOP: while a < drain_end && b < other.ranges.len() {
205+
// Basically, the easy cases are when neither range overlaps with
206+
// each other. If the `b` range is less than our current `a`
207+
// range, then we can skip it and move on.
208+
if other.ranges[b].upper() < self.ranges[a].lower() {
209+
b += 1;
210+
continue;
211+
}
212+
// ... similarly for the `a` range. If it's less than the smallest
213+
// `b` range, then we can add it as-is.
214+
if self.ranges[a].upper() < other.ranges[b].lower() {
215+
let range = self.ranges[a];
216+
self.ranges.push(range);
217+
a += 1;
218+
continue;
219+
}
220+
// Otherwise, we have overlapping ranges.
221+
assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
230222

231-
let mut b = 0;
232-
for a in 0..drain_end {
233223
// This part is tricky and was non-obvious to me without looking
234224
// at explicit examples (see the tests). The trickiness stems from
235225
// two things: 1) subtracting a range from another range could
@@ -241,34 +231,47 @@ impl<I: Interval> IntervalSet<I> {
241231
// For example, if our `a` range is `a-t` and our next three `b`
242232
// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
243233
// subtraction three times before moving on to the next `a` range.
244-
self.ranges.push(self.ranges[a]);
245-
// Only when `b` is not above `a`, `b` might apply to current
246-
// `a` range.
234+
let mut range = self.ranges[a];
247235
while b < other.ranges.len()
248-
&& other.ranges[b].lower() <= self.ranges[a].upper()
236+
&& !range.is_intersection_empty(&other.ranges[b])
249237
{
250-
match self.ranges.pop().unwrap().difference(&other.ranges[b]) {
251-
(Some(range1), None) | (None, Some(range1)) => {
252-
self.ranges.push(range1);
238+
let old_range = range;
239+
range = match range.difference(&other.ranges[b]) {
240+
(None, None) => {
241+
// We lost the entire range, so move on to the next
242+
// without adding this one.
243+
a += 1;
244+
continue 'LOOP;
253245
}
246+
(Some(range1), None) | (None, Some(range1)) => range1,
254247
(Some(range1), Some(range2)) => {
255248
self.ranges.push(range1);
256-
self.ranges.push(range2);
249+
range2
257250
}
258-
(None, None) => {}
251+
};
252+
// It's possible that the `b` range has more to contribute
253+
// here. In particular, if it is greater than the original
254+
// range, then it might impact the next `a` range *and* it
255+
// has impacted the current `a` range as much as possible,
256+
// so we can quit. We don't bump `b` so that the next `a`
257+
// range can apply it.
258+
if other.ranges[b].upper() > old_range.upper() {
259+
break;
259260
}
260-
// The next `b` range might apply to the current
261+
// Otherwise, the next `b` range might apply to the current
261262
// `a` range.
262263
b += 1;
263264
}
264-
// It's possible that the last `b` range has more to
265-
// contribute to the next `a`. We don't bump the last
266-
// `b` so that the next `a` range can apply it.
267-
b = b.saturating_sub(1);
265+
self.ranges.push(range);
266+
a += 1;
267+
}
268+
while a < drain_end {
269+
let range = self.ranges[a];
270+
self.ranges.push(range);
271+
a += 1;
268272
}
269-
270273
self.ranges.drain(..drain_end);
271-
self.folded = self.ranges.is_empty() || (self.folded && other.folded);
274+
self.folded = self.folded && other.folded;
272275
}
273276

274277
/// Compute the symmetric difference of the two sets, in place.
@@ -279,83 +282,11 @@ impl<I: Interval> IntervalSet<I> {
279282
/// set. That is, the set will contain all elements in either set,
280283
/// but will not contain any elements that are in both sets.
281284
pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
282-
if self.ranges.is_empty() {
283-
self.ranges.extend(&other.ranges);
284-
self.folded = other.folded;
285-
return;
286-
}
287-
if other.ranges.is_empty() {
288-
return;
289-
}
290-
291-
// There should be a way to do this in-place with constant memory,
292-
// but I couldn't figure out a simple way to do it. So just append
293-
// the symmetric difference to the end of this range, and then drain
294-
// it before we're done.
295-
let drain_end = self.ranges.len();
296-
let mut b = 0;
297-
let mut b_range = Some(other.ranges[b]);
298-
for a in 0..drain_end {
299-
self.ranges.push(self.ranges[a]);
300-
while b_range
301-
.map_or(false, |r| r.lower() <= self.ranges[a].upper())
302-
{
303-
let (range1, range2) = match self
304-
.ranges
305-
.pop()
306-
.unwrap()
307-
.symmetric_difference(&b_range.as_ref().unwrap())
308-
{
309-
(Some(range1), None) | (None, Some(range1)) => {
310-
(Some(range1), None)
311-
}
312-
(Some(range1), Some(range2)) => {
313-
(Some(range1), Some(range2))
314-
}
315-
(None, None) => (None, None),
316-
};
317-
if let Some(range) = range1 {
318-
if self.ranges.len() > drain_end
319-
&& self.ranges.last().unwrap().is_contiguous(&range)
320-
{
321-
self.ranges
322-
.last_mut()
323-
.map(|last| *last = last.union(&range).unwrap());
324-
} else {
325-
self.ranges.push(range);
326-
}
327-
}
328-
if let Some(range) = range2 {
329-
self.ranges.push(range);
330-
}
331-
332-
b_range = if self.ranges.len() > drain_end
333-
&& self.ranges.last().unwrap().upper()
334-
> self.ranges[a].upper()
335-
{
336-
Some(*self.ranges.last().unwrap())
337-
} else {
338-
b += 1;
339-
other.ranges.get(b).cloned()
340-
};
341-
}
342-
}
343-
while let Some(range) = b_range {
344-
if self.ranges.len() > drain_end
345-
&& self.ranges.last().unwrap().is_contiguous(&range)
346-
{
347-
self.ranges
348-
.last_mut()
349-
.map(|last| *last = last.union(&range).unwrap());
350-
} else {
351-
self.ranges.push(range);
352-
}
353-
b += 1;
354-
b_range = other.ranges.get(b).cloned();
355-
}
356-
357-
self.ranges.drain(..drain_end);
358-
self.folded = self.ranges.is_empty() || (self.folded && other.folded);
285+
// TODO(burntsushi): Fix this so that it amortizes allocation.
286+
let mut intersection = self.clone();
287+
intersection.intersect(other);
288+
self.union(other);
289+
self.difference(&intersection);
359290
}
360291

361292
/// Negate this interval set.
@@ -371,44 +302,28 @@ impl<I: Interval> IntervalSet<I> {
371302
return;
372303
}
373304

305+
// There should be a way to do this in-place with constant memory,
306+
// but I couldn't figure out a simple way to do it. So just append
307+
// the negation to the end of this range, and then drain it before
308+
// we're done.
309+
let drain_end = self.ranges.len();
310+
374311
// We do checked arithmetic below because of the canonical ordering
375312
// invariant.
376313
if self.ranges[0].lower() > I::Bound::min_value() {
377-
let mut pre_upper = self.ranges[0].upper();
378-
self.ranges[0] = I::create(
379-
I::Bound::min_value(),
380-
self.ranges[0].lower().decrement(),
381-
);
382-
for i in 1..self.ranges.len() {
383-
let lower = pre_upper.increment();
384-
pre_upper = self.ranges[i].upper();
385-
self.ranges[i] =
386-
I::create(lower, self.ranges[i].lower().decrement());
387-
}
388-
if pre_upper < I::Bound::max_value() {
389-
self.ranges.push(I::create(
390-
pre_upper.increment(),
391-
I::Bound::max_value(),
392-
));
393-
}
394-
} else {
395-
for i in 1..self.ranges.len() {
396-
self.ranges[i - 1] = I::create(
397-
self.ranges[i - 1].upper().increment(),
398-
self.ranges[i].lower().decrement(),
399-
);
400-
}
401-
if self.ranges.last().unwrap().upper() < I::Bound::max_value() {
402-
self.ranges.last_mut().map(|range| {
403-
*range = I::create(
404-
range.upper().increment(),
405-
I::Bound::max_value(),
406-
)
407-
});
408-
} else {
409-
self.ranges.pop();
410-
}
314+
let upper = self.ranges[0].lower().decrement();
315+
self.ranges.push(I::create(I::Bound::min_value(), upper));
316+
}
317+
for i in 1..drain_end {
318+
let lower = self.ranges[i - 1].upper().increment();
319+
let upper = self.ranges[i].lower().decrement();
320+
self.ranges.push(I::create(lower, upper));
321+
}
322+
if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
323+
let lower = self.ranges[drain_end - 1].upper().increment();
324+
self.ranges.push(I::create(lower, I::Bound::max_value()));
411325
}
326+
self.ranges.drain(..drain_end);
412327
// We don't need to update whether this set is folded or not, because
413328
// it is conservatively preserved through negation. Namely, if a set
414329
// is not folded, then it is possible that its negation is folded, for
@@ -422,7 +337,6 @@ impl<I: Interval> IntervalSet<I> {
422337
// of case folded characters. Negating it in turn means that all
423338
// equivalence classes in the set are negated, and any equivalence
424339
// class that was previously not in the set is now entirely in the set.
425-
self.folded = self.ranges.is_empty() || self.folded;
426340
}
427341

428342
/// Converts this set into a canonical ordering.
@@ -433,20 +347,24 @@ impl<I: Interval> IntervalSet<I> {
433347
self.ranges.sort();
434348
assert!(!self.ranges.is_empty());
435349

436-
// We maintain the canonicalization results in-place at `0..newi`.
437-
// `newi` will keep track of the end of the canonicalized ranges.
438-
let mut newi = 0;
439-
for oldi in 1..self.ranges.len() {
440-
// The last new range gets merged with currnet old range when
441-
// unionable. If not, we update `newi` and store it as a new range.
442-
if let Some(union) = self.ranges[newi].union(&self.ranges[oldi]) {
443-
self.ranges[newi] = union;
444-
} else {
445-
newi += 1;
446-
self.ranges[newi] = self.ranges[oldi];
350+
// Is there a way to do this in-place with constant memory? I couldn't
351+
// figure out a way to do it. So just append the canonicalization to
352+
// the end of this range, and then drain it before we're done.
353+
let drain_end = self.ranges.len();
354+
for oldi in 0..drain_end {
355+
// If we've added at least one new range, then check if we can
356+
// merge this range in the previously added range.
357+
if self.ranges.len() > drain_end {
358+
let (last, rest) = self.ranges.split_last_mut().unwrap();
359+
if let Some(union) = last.union(&rest[oldi]) {
360+
*last = union;
361+
continue;
362+
}
447363
}
364+
let range = self.ranges[oldi];
365+
self.ranges.push(range);
448366
}
449-
self.ranges.truncate(newi + 1);
367+
self.ranges.drain(..drain_end);
450368
}
451369

452370
/// Returns true if and only if this class is in a canonical ordering.
@@ -568,13 +486,7 @@ pub trait Interval:
568486
other: &Self,
569487
) -> (Option<Self>, Option<Self>) {
570488
let union = match self.union(other) {
571-
None => {
572-
return if self.upper() < other.lower() {
573-
(Some(self.clone()), Some(other.clone()))
574-
} else {
575-
(Some(other.clone()), Some(self.clone()))
576-
}
577-
}
489+
None => return (Some(self.clone()), Some(other.clone())),
578490
Some(union) => union,
579491
};
580492
let intersection = match self.intersect(other) {

0 commit comments

Comments
 (0)