Skip to content

Commit

Permalink
[GR-48806] Support for atomic groups and possessive quantifiers in Py…
Browse files Browse the repository at this point in the history
…thon regexps.

PullRequest: graal/15754
  • Loading branch information
jirkamarsik committed Nov 9, 2023
2 parents 24e37a5 + a8f641c commit 21fb0bb
Show file tree
Hide file tree
Showing 12 changed files with 239 additions and 143 deletions.
4 changes: 4 additions & 0 deletions regex/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

This changelog summarizes major changes between TRegex versions relevant to language implementors integrating TRegex into their language. This document will focus on API changes relevant to integrators of TRegex.

## Version 24.0.0

* Added support for atomic groups and possessive quantifiers in Python regular expressions.

## Version 23.1.0

* Added support for Unicode sets mode (`v` flag) in ECMAScript regular expressions.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -351,37 +351,31 @@ public void testInlineGlobalFlagsInComments() {
// ...when the verbose flag is passed to re.compile,
test("#(?i)\nfoo", "x", "foo", 0, true, 0, 3, -1);
test("#(?i)\nfoo", "x", "FOO", 0, false);
// when the verbose flag is set inline, either before or after,
// ...when the verbose flag is set inline,
test("(?x)#(?i)\nfoo", "", "foo", 0, true, 0, 3, -1);
test("(?x)#(?i)\nfoo", "", "FOO", 0, false);
test("#(?i)\n(?x)foo", "", "foo", 0, true, 0, 3, -1);
test("#(?i)\n(?x)foo", "", "FOO", 0, false);
// and when the verbose flag is set in a local group.
test("(?x:#(?i)\n)foo", "", "foo", 0, true, 0, 3, -1);
test("(?x:#(?i)\n)foo", "", "FOO", 0, false);
// The (?x) inline flag can be hidden in a comment.
test("#(?x)(?i)\nfoo", "", "foo", 0, true, 0, 3, -1);
test("#(?x)(?i)\nfoo", "", "FOO", 0, false);

// Comments should be disabled in (?-x:...) blocks, inline flags within should be respected.
test("(?x:(?-x:#(?i)\n))foo", "", "#\nfoo", 0, true, 0, 5, -1);
test("(?x:(?-x:#(?i)\n))foo", "", "#\nFOO", 0, true, 0, 5, -1);
// This throws an internal exception inside CPython's sre due to a bug, but it should work.
test("(?-x:#(?i)\n)foo", "x", "#\nfoo", 0, true, 0, 5, -1);
test("(?-x:#(?i)\n)foo", "x", "#\nFOO", 0, true, 0, 5, -1);
test("(?x)(?-x:#(?i)\n)foo", "", "#\nfoo", 0, true, 0, 5, -1);
test("(?x)(?-x:#(?i)\n)foo", "", "#\nFOO", 0, true, 0, 5, -1);

test("(?##)(?i)(?#\n)foo", "x", "foo", 0, true, 0, 3, -1);
test("(?##)(?i)(?#\n)foo", "x", "FOO", 0, true, 0, 3, -1);

test("(?#[)(?i)(?#])foo", "", "foo", 0, true, 0, 3, -1);
test("(?#[)(?i)(?#])foo", "", "FOO", 0, true, 0, 3, -1);

// NB: The verbose flag can no longer be set inline in the middle of a regexp.
expectSyntaxError("#(?i)\n(?x)foo", "", "global flags not at the start of the expression", 1);
expectSyntaxError("#(?x)(?i)\nfoo", "", "global flags not at the start of the expression", 1);
expectSyntaxError("(?x:(?-x:#(?i)\n))foo", "", "global flags not at the start of the expression", 10);
expectSyntaxError("(?-x:#(?i)\n)foo", "x", "global flags not at the start of the expression", 6);
expectSyntaxError("(?x)(?-x:#(?i)\n)foo", "", "global flags not at the start of the expression", 10);
}

@Test
public void testInlineGlobalFlagsEscaped() {
test("\\\\(?i)foo", "", "\\FOO", 0, true, 0, 4, -1);
// NB: Global flags such as (?i) can no longer be set inline in the middle of a regexp.
expectSyntaxError("\\\\(?i)foo", "", "global flags not at the start of the expression", 2);
}

@Test
Expand Down Expand Up @@ -509,6 +503,67 @@ public void testLazyLastGroup() {
}
}

/**
 * Checks the extra case-insensitive equivalences: for every {pattern, input} pair below, the
 * single-character pattern compiled with the {@code i} flag must match the single-character
 * input at [0, 1).
 */
@Test
public void testCasefixEquivalences() {
    // Generated using re._casefix._EXTRA_CASES from CPython 3.11.4
    // Each row is {pattern, input}; rows are checked in the order listed.
    final String[][] equivalentPairs = {
                    {"\u0069", "\u0131"},
                    {"\u0073", "\u017f"},
                    {"\u00b5", "\u03bc"},
                    {"\u0131", "\u0069"},
                    {"\u017f", "\u0073"},
                    {"\u0345", "\u03b9"},
                    {"\u0345", "\u1fbe"},
                    {"\u0390", "\u1fd3"},
                    {"\u03b0", "\u1fe3"},
                    {"\u03b2", "\u03d0"},
                    {"\u03b5", "\u03f5"},
                    {"\u03b8", "\u03d1"},
                    {"\u03b9", "\u0345"},
                    {"\u03b9", "\u1fbe"},
                    {"\u03ba", "\u03f0"},
                    {"\u03bc", "\u00b5"},
                    {"\u03c0", "\u03d6"},
                    {"\u03c1", "\u03f1"},
                    {"\u03c2", "\u03c3"},
                    {"\u03c3", "\u03c2"},
                    {"\u03c6", "\u03d5"},
                    {"\u03d0", "\u03b2"},
                    {"\u03d1", "\u03b8"},
                    {"\u03d5", "\u03c6"},
                    {"\u03d6", "\u03c0"},
                    {"\u03f0", "\u03ba"},
                    {"\u03f1", "\u03c1"},
                    {"\u03f5", "\u03b5"},
                    {"\u0432", "\u1c80"},
                    {"\u0434", "\u1c81"},
                    {"\u043e", "\u1c82"},
                    {"\u0441", "\u1c83"},
                    {"\u0442", "\u1c84"},
                    {"\u0442", "\u1c85"},
                    {"\u044a", "\u1c86"},
                    {"\u0463", "\u1c87"},
                    {"\u1c80", "\u0432"},
                    {"\u1c81", "\u0434"},
                    {"\u1c82", "\u043e"},
                    {"\u1c83", "\u0441"},
                    {"\u1c84", "\u0442"},
                    {"\u1c84", "\u1c85"},
                    {"\u1c85", "\u0442"},
                    {"\u1c85", "\u1c84"},
                    {"\u1c86", "\u044a"},
                    {"\u1c87", "\u0463"},
                    {"\u1c88", "\ua64b"},
                    {"\u1e61", "\u1e9b"},
                    {"\u1e9b", "\u1e61"},
                    {"\u1fbe", "\u0345"},
                    {"\u1fbe", "\u03b9"},
                    {"\u1fd3", "\u0390"},
                    {"\u1fe3", "\u03b0"},
                    {"\ua64b", "\u1c88"},
                    {"\ufb05", "\ufb06"},
                    {"\ufb06", "\ufb05"},
    };
    for (String[] pair : equivalentPairs) {
        test(pair[0], "i", pair[1], 0, true, 0, 1);
    }
}

@Test
public void testSyntaxErrors() {
// Generated using sre from CPython 3.10.8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
package com.oracle.truffle.regex.errors;

import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.regex.tregex.parser.flavors.PythonREMode;

import java.util.regex.Pattern;

public interface PyErrorMessages {

Expand Down Expand Up @@ -72,6 +75,7 @@ public interface PyErrorMessages {
String UNTERMINATED_NAME = "missing ), unterminated name";
String UNTERMINATED_NAME_ANGLE_BRACKET = "missing >, unterminated name";
String UNTERMINATED_SUBPATTERN = "missing ), unterminated subpattern";
String GLOBAL_FLAGS_NOT_AT_START = "global flags not at the start of the expression";

@TruffleBoundary
static String badCharacterInGroupName(String name) {
Expand Down Expand Up @@ -143,8 +147,29 @@ static String unknownExtensionQ(int chr) {
return "unknown extension ?" + new String(Character.toChars(chr));
}

Pattern NON_ASCII_CHARACTERS = Pattern.compile("\\P{ASCII}");
Pattern NON_PRINTABLE_CHARACTERS = Pattern.compile("\\P{Print}", Pattern.UNICODE_CHARACTER_CLASS);

/**
 * Returns {@code str} with selected characters replaced by backslash escapes.
 * <p>
 * Which characters are escaped depends on {@code mode}: {@code Bytes} escapes everything matched
 * by {@link #NON_ASCII_CHARACTERS}, {@code Str} everything matched by
 * {@link #NON_PRINTABLE_CHARACTERS}. Each escaped code point is written as {@code \xNN} (up to
 * 0xff), {@code \}{@code uNNNN} (up to 0xffff) or {@code \UNNNNNNNN} (above that), using
 * lowercase hexadecimal digits.
 *
 * @param str the text to escape
 * @param mode selects the set of characters to escape
 * @return the escaped text
 */
@TruffleBoundary
private static String repr(String str, PythonREMode mode) {
    Pattern escapable = switch (mode) {
        case Bytes -> NON_ASCII_CHARACTERS;
        case Str -> NON_PRINTABLE_CHARACTERS;
    };
    var matcher = escapable.matcher(str);
    StringBuilder escaped = new StringBuilder();
    while (matcher.find()) {
        int codePoint = matcher.group().codePointAt(0);
        // Pick the narrowest escape form that can hold the code point.
        String template;
        if (codePoint > 0xffff) {
            template = "\\\\U%08x";
        } else if (codePoint > 0xff) {
            template = "\\\\u%04x";
        } else {
            template = "\\\\x%02x";
        }
        // The doubled backslash is collapsed to a single literal one by appendReplacement.
        matcher.appendReplacement(escaped, String.format(template, codePoint));
    }
    matcher.appendTail(escaped);
    return escaped.toString();
}

@TruffleBoundary
static String unknownGroupName(String name) {
return "unknown group name '" + name + "'";
static String unknownGroupName(String name, PythonREMode mode) {
return "unknown group name '" + repr(name, mode) + "'";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ protected boolean featureEnabledBoundedQuantifierEmptyMin() {
return false;
}

/**
 * Possessive quantifiers (a {@code +} suffix on a quantifier) are not enabled here.
 *
 * @return always {@code false}
 */
@Override
protected boolean featureEnabledPossessiveQuantifiers() {
return false;
}

@Override
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -714,20 +714,32 @@ public void addQuantifier(Token.Quantifier quantifier) {
} else {
if (quantifier.getMin() == 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup ||
curTerm.isCharacterClass() && curTerm.asCharacterClass().getCharSet().matchesNothing())) {
// NB: If JavaScript ever gets possessive quantifiers, we might have to adjust this.
removeCurTerm();
return;
}
}
if (quantifier.getMin() > 0 && (curTerm.isLookAroundAssertion() || curTermIsZeroWidthGroup)) {
// quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
// ignore.
// Quantifying LookAroundAssertions doesn't do anything if quantifier.getMin() > 0, so
// ignore. A possessive quantifier would still result in atomicity.
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
}
return;
}
if (quantifier.getMin() == 1 && quantifier.getMax() == 1) {
// x{1,1} -> x
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
}
return;
}
curTerm = addQuantifier(curTerm, quantifier);
if (quantifier.isPossessive()) {
wrapCurTermInAtomicGroup();
// do not attempt to merge quantifiers when possessive quantifiers are present
return;
}
// merge equal successive quantified terms
if (curSequence.size() > 1) {
Term prevTerm = curSequence.getTerms().get(curSequence.size() - 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ public RegexLexer(RegexSource source, CompilationBuffer compilationBuffer) {
*/
protected abstract boolean featureEnabledBoundedQuantifierEmptyMin();

/**
 * Returns {@code true} if possessive quantifiers ({@code +} suffix) are allowed.
 * <p>
 * When enabled, {@code parseQuantifier} consumes a {@code +} that directly follows a quantifier
 * and marks the resulting token as possessive. The lazy marker {@code ?} is checked first, so a
 * quantifier is never both lazy and possessive. When disabled, the {@code +} is left in the
 * input.
 */
protected abstract boolean featureEnabledPossessiveQuantifiers();

/**
* Returns {@code true} if the first character in a character class must be interpreted as part
* of the character set, even if it is the closing bracket {@code ']'}.
Expand Down Expand Up @@ -1012,7 +1017,14 @@ private Token parseQuantifier(char c) throws RegexSyntaxException {
min = c == '+' ? 1 : 0;
max = c == '?' ? 1 : -1;
}
return Token.createQuantifier((int) min, (int) max, !consumingLookahead("?"));
boolean greedy = true;
boolean possessive = false;
if (consumingLookahead('?')) {
greedy = false;
} else if (featureEnabledPossessiveQuantifiers() && consumingLookahead('+')) {
possessive = true;
}
return Token.createQuantifier((int) min, (int) max, greedy, possessive);
}

private boolean isQuantifierOutOfOrder(long parsedMin, long parsedMax, int startMin, int lengthMin, int lengthMax) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public enum Kind {
alternation,
captureGroupBegin,
nonCaptureGroupBegin,
atomicGroupBegin,
lookAheadAssertionBegin,
lookBehindAssertionBegin,
groupEnd,
Expand All @@ -93,6 +94,7 @@ public enum Kind {
private static final Token NON_CAPTURE_GROUP_BEGIN = new Token(Kind.nonCaptureGroupBegin);
private static final Token CHAR_CLASS_BEGIN = new Token(Kind.charClassBegin);
private static final Token CHAR_CLASS_END = new Token(Kind.charClassEnd);
private static final Token ATOMIC_GROUP_BEGIN = new Token(Kind.atomicGroupBegin);
private static final Token LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(false);
private static final Token NEGATIVE_LOOK_AHEAD_ASSERTION_BEGIN = new LookAheadAssertionBegin(true);
private static final Token LOOK_BEHIND_ASSERTION_BEGIN = new LookBehindAssertionBegin(false);
Expand Down Expand Up @@ -139,6 +141,10 @@ public static Token createNonCaptureGroupBegin() {
return NON_CAPTURE_GROUP_BEGIN;
}

/**
 * Returns the token that marks the beginning of an atomic group.
 * <p>
 * The same pre-allocated {@code Kind.atomicGroupBegin} instance is returned on every call.
 */
public static Token createAtomicGroupBegin() {
return ATOMIC_GROUP_BEGIN;
}

/**
 * Returns the token that marks the beginning of a look-ahead assertion.
 * <p>
 * The same pre-allocated instance is returned on every call. It is the
 * {@code LookAheadAssertionBegin} built with {@code false}; the negative look-ahead has its own
 * constant built with {@code true}.
 */
public static Token createLookAheadAssertionBegin() {
return LOOK_AHEAD_ASSERTION_BEGIN;
}
Expand All @@ -163,6 +169,10 @@ public static Quantifier createQuantifier(int min, int max, boolean greedy) {
return new Quantifier(min, max, greedy);
}

/**
 * Creates a new quantifier token with an explicit possessive flag.
 *
 * @param min minimum number of repetitions
 * @param max maximum number of repetitions (NOTE(review): the lexer appears to pass {@code -1}
 *            for "no upper bound" - confirm against {@code Quantifier.isInfiniteLoop})
 * @param greedy {@code false} for a lazy quantifier ({@code ?} suffix)
 * @param possessive {@code true} for a possessive quantifier ({@code +} suffix)
 * @return a freshly allocated {@link Quantifier}
 */
public static Quantifier createQuantifier(int min, int max, boolean greedy, boolean possessive) {
return new Quantifier(min, max, greedy, possessive);
}

/**
 * Creates a new token for a single literal character.
 *
 * @param codePoint the code point of the literal character
 * @return a freshly allocated {@link LiteralCharacter}
 */
public static LiteralCharacter createLiteralCharacter(int codePoint) {
return new LiteralCharacter(codePoint);
}
Expand Down Expand Up @@ -244,14 +254,20 @@ public static final class Quantifier extends Token {
private final int min;
private final int max;
private final boolean greedy;
private final boolean possessive;
@CompilationFinal private int index = -1;
@CompilationFinal private int zeroWidthIndex = -1;

public Quantifier(int min, int max, boolean greedy) {
public Quantifier(int min, int max, boolean greedy, boolean possessive) {
super(Kind.quantifier);
this.min = min;
this.max = max;
this.greedy = greedy;
this.possessive = possessive;
}

/**
 * Creates a non-possessive quantifier; equivalent to calling the four-argument constructor with
 * {@code possessive == false}.
 *
 * @param min minimum number of repetitions
 * @param max maximum number of repetitions
 * @param greedy {@code false} for a lazy quantifier
 */
public Quantifier(int min, int max, boolean greedy) {
this(min, max, greedy, false);
}

public boolean isInfiniteLoop() {
Expand All @@ -276,6 +292,10 @@ public boolean isGreedy() {
return greedy;
}

/**
 * Returns {@code true} if this quantifier was created as possessive. Possessive quantifiers are
 * printed with a trailing {@code +} by {@link #toString()}.
 */
public boolean isPossessive() {
return possessive;
}

/**
 * Returns {@code true} if {@code index} holds a non-negative value. The field starts out as
 * {@code -1}, so this is {@code false} until a non-negative index has been stored.
 */
public boolean hasIndex() {
return index >= 0;
}
Expand Down Expand Up @@ -332,11 +352,11 @@ public boolean isDead() {

@Override
public int hashCode() {
return Objects.hash(min, max, greedy, index, zeroWidthIndex);
return Objects.hash(min, max, greedy, possessive, index, zeroWidthIndex);
}

public boolean equalsSemantic(Quantifier o) {
return min == o.min && max == o.max && greedy == o.greedy;
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive;
}

@Override
Expand All @@ -348,14 +368,14 @@ public boolean equals(Object obj) {
return false;
}
Quantifier o = (Quantifier) obj;
return min == o.min && max == o.max && greedy == o.greedy && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
return min == o.min && max == o.max && greedy == o.greedy && possessive == o.possessive && index == o.index && zeroWidthIndex == o.zeroWidthIndex;
}

@TruffleBoundary
@Override
public String toString() {
String ret = minMaxToString();
return isGreedy() ? ret : ret + "?";
return isPossessive() ? ret + "+" : isGreedy() ? ret : ret + "?";
}

private String minMaxToString() {
Expand All @@ -377,7 +397,8 @@ public JsonObject toJson() {
return super.toJson().append(
Json.prop("min", getMin()),
Json.prop("max", getMax()),
Json.prop("greedy", isGreedy()));
Json.prop("greedy", isGreedy()),
Json.prop("possessive", isPossessive()));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ protected boolean featureEnabledBoundedQuantifierEmptyMin() {
return false;
}

/**
 * Possessive quantifiers (a {@code +} suffix on a quantifier) are not enabled here.
 *
 * @return always {@code false}
 */
@Override
protected boolean featureEnabledPossessiveQuantifiers() {
return false;
}

@Override
protected boolean featureEnabledCharClassFirstBracketIsLiteral() {
return true;
Expand Down
Loading

0 comments on commit 21fb0bb

Please sign in to comment.