Skip to content

Commit

Permalink
LUCENE-4078: PatternReplaceCharFilter assertion error caused by malfo…
Browse files Browse the repository at this point in the history
…rmed

utf-16. This is most likely a bug in the JDK, because the replacement from
Pattern("").replaceAll("x") is inserted in between the characters of a
surrogate pair, which corrupts strings.

A temporary fix is to check for this in the random pattern generator and pick
again if it is detected.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1343214 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
dweiss committed May 28, 2012
1 parent 02e4083 commit aff97ab
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
Expand Down Expand Up @@ -306,7 +305,7 @@ public void testRandomStrings() throws Exception {
long maxTime = 1000 * 2;
Random random = new Random(random().nextLong());
for (int i = 0; i < numPatterns && start + maxTime > System.currentTimeMillis(); i++) {
final Pattern p = randomPattern();
final Pattern p = _TestUtil.randomPattern(random());
final String replacement = _TestUtil.randomSimpleString(random);
Analyzer a = new Analyzer() {
@Override
Expand All @@ -325,14 +324,4 @@ protected Reader initReader(Reader reader) {
* time for certain patterns. */ 40, true); // only ascii
}
}

/**
 * Keeps drawing random regexp-like strings until one of them compiles,
 * then returns the compiled pattern.
 */
public Pattern randomPattern() {
  for (;;) {
    try {
      final String regex = _TestUtil.randomRegexpishString(random());
      return Pattern.compile(regex);
    } catch (PatternSyntaxException ignored) {
      // Not a valid regular expression; draw another candidate.
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

Expand Down Expand Up @@ -900,4 +902,24 @@ public static void shutdownExecutorService(ExecutorService ex) {
}
}
}

/**
 * Builds a random Pattern that is guaranteed to compile. Candidates are drawn
 * repeatedly until one both compiles and leaves a string containing a surrogate
 * pair as valid UTF-16 after a replace-all (see LUCENE-4078).
 *
 * <p>Take care when running random patterns over longer inputs: some patterns
 * may need exponential time in backtracking implementations (such as Java's).
 *
 * @param random source of randomness used to generate candidate expressions
 * @return a compiled pattern that passed the UTF-16 sanity check
 */
public static Pattern randomPattern(Random random) {
  // Probe text with one supplementary code point (a surrogate pair) in the middle.
  final String probe = "AB\uD840\uDC00C";
  Pattern accepted = null;
  while (accepted == null) {
    try {
      final Pattern candidate = Pattern.compile(_TestUtil.randomRegexpishString(random));
      final String replaced = candidate.matcher(probe).replaceAll("_");
      // Reject candidates whose replacement breaks the surrogate pair apart;
      // see LUCENE-4078 for discussion.
      if (UnicodeUtil.validUTF16String(replaced)) {
        accepted = candidate;
      }
    } catch (PatternSyntaxException ignored) {
      // The candidate did not compile; keep looping until one does.
    }
  }
  return accepted;
}
}

0 comments on commit aff97ab

Please sign in to comment.