Skip to content

Commit

Permalink
Revert "NIFI-4272 support multiple captures when EL is present in rep…
Browse files Browse the repository at this point in the history
…lacement value"

This reverts commit f7f809c.

NIFI-5474, NIFI-4272: When using Regex Replace with ReplaceText, and there are capturing groups, ensure that we populate the 'additionalVariables' map for each match of the regex. This allows Expression Language to reference the back-references properly even when there are multiple matches

Signed-off-by: Matthew Burgess <[email protected]>

This closes apache#2951
  • Loading branch information
markap14 authored and mattyb149 committed Sep 11, 2018
1 parent 87cf474 commit 0274bd4
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 167 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@
@SystemResourceConsideration(resource = SystemResource.MEMORY)
public class ReplaceText extends AbstractProcessor {

private static Pattern QUOTED_GROUP_REF_PATTERN = Pattern.compile("\\$\\{\\s*?'\\$\\d+?'.+?\\}");
private static Pattern DOUBLE_QUOTED_GROUP_REF_PATTERN = Pattern.compile("\\$\\{\\s*?\"\\$\\d+?\".+?\\}");
private static Pattern LITERAL_QUOTED_PATTERN = Pattern.compile("literal\\(('.*?')\\)",Pattern.DOTALL);
private static Pattern REPLACEMENT_NORMALIZATION_PATTERN = Pattern.compile("(\\$\\D)");

// Constants
public static final String LINE_BY_LINE = "Line-by-Line";
Expand Down Expand Up @@ -295,9 +293,7 @@ public void onTrigger(final ProcessContext context, final ProcessSession session
final StopWatch stopWatch = new StopWatch(true);

try {

flowFile = replacementStrategyExecutor.replace(flowFile, session, context, evaluateMode, charset, maxBufferSize);

} catch (StackOverflowError e) {
// Some regular expressions can produce many matches on large input data size using recursive code
// do not log the StackOverflowError stack trace
Expand All @@ -314,8 +310,12 @@ public void onTrigger(final ProcessContext context, final ProcessSession session

// If we find a back reference that is not valid, then we will treat it as a literal string. For example, if we have 3 capturing
// groups and the Replacement Value has the value is "I owe $8 to him", then we want to treat the $8 as a literal "$8", rather
// than attempting to use it as a back reference. We do this even if there are no capture groups.
// than attempting to use it as a back reference.
private static String escapeLiteralBackReferences(final String unescaped, final int numCapturingGroups) {
if (numCapturingGroups == 0) {
return unescaped;
}

String value = unescaped;
final Matcher backRefMatcher = unescapedBackReferencePattern.matcher(value); // consider unescaped back references
while (backRefMatcher.find()) {
Expand Down Expand Up @@ -523,12 +523,8 @@ public RegexReplace(final byte[] buffer, final ProcessContext context) {

@Override
public FlowFile replace(final FlowFile flowFile, final ProcessSession session, final ProcessContext context, final String evaluateMode, final Charset charset, final int maxBufferSize) {
final AttributeValueDecorator quotedAttributeDecorator = new AttributeValueDecorator() {
@Override
public String decorate(final String attributeValue) {
return Pattern.quote(attributeValue);
}
};
final AttributeValueDecorator quotedAttributeDecorator = Pattern::quote;

final String searchRegex = context.getProperty(SEARCH_VALUE).evaluateAttributeExpressions(flowFile, quotedAttributeDecorator).getValue();
final Pattern searchPattern = Pattern.compile(searchRegex);

Expand All @@ -545,61 +541,79 @@ public void process(final InputStream in) throws IOException {
final String contentString = new String(buffer, 0, flowFileSize, charset);
additionalAttrs.clear();
final Matcher matcher = searchPattern.matcher(contentString);
if (matcher.find()) {
for (int i = 1; i <= matcher.groupCount(); i++) {
final String groupValue = matcher.group(i);
additionalAttrs.put("$" + i, groupValue);

final PropertyValue replacementValueProperty = context.getProperty(REPLACEMENT_VALUE);

int matches = 0;
final StringBuffer sb = new StringBuffer();
while (matcher.find()) {
matches++;

for (int i=0; i <= matcher.groupCount(); i++) {
additionalAttrs.put("$" + i, matcher.group(i));
}

// prepare the string and do the regex replace first
// then evaluate the EL on the result
String replacement = context.getProperty(REPLACEMENT_VALUE).getValue();
String replacement = replacementValueProperty.evaluateAttributeExpressions(flowFile, additionalAttrs, escapeBackRefDecorator).getValue();
replacement = escapeLiteralBackReferences(replacement, numCapturingGroups);
replacement = escapeExpressionDollarSigns(replacement);
replacement = wrapLiterals(replacement);
replacement = contentString.replaceAll(searchRegex, replacement);
replacement = escapeForEvaluation(replacement);
String replacementFinal = normalizeReplacementString(replacement);

matcher.appendReplacement(sb, replacementFinal);
}

PropertyValue tempValue = context.newPropertyValue(replacement);
final String updatedValue = tempValue.evaluateAttributeExpressions(flowFile, additionalAttrs, null).getValue();
if (matches > 0) {
matcher.appendTail(sb);

final String updatedValue = sb.toString();
updatedFlowFile = session.write(flowFile, new OutputStreamCallback() {
@Override
public void process(final OutputStream out) throws IOException {
out.write(updatedValue.getBytes(charset));
}
});
} else {
// If no match, just return the original. No need to write out any content.
return flowFile;
}

} else {
updatedFlowFile = session.write(flowFile, new StreamCallback() {
@Override
public void process(final InputStream in, final OutputStream out) throws IOException {
try (NLKBufferedReader br = new NLKBufferedReader(new InputStreamReader(in, charset), maxBufferSize);
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, charset))) {
String oneLine;

final StringBuffer sb = new StringBuffer();
Matcher matcher = null;

while (null != (oneLine = br.readLine())) {
additionalAttrs.clear();
final Matcher matcher = searchPattern.matcher(oneLine);
if (matcher.find()) {
for (int i = 1; i <= matcher.groupCount(); i++) {
final String groupValue = matcher.group(i);
additionalAttrs.put("$" + i, groupValue);
if (matcher == null) {
matcher = searchPattern.matcher(oneLine);
} else {
matcher.reset(oneLine);
}

int matches = 0;
sb.setLength(0);

while (matcher.find()) {
matches++;

for (int i=0; i <= matcher.groupCount(); i++) {
additionalAttrs.put("$" + i, matcher.group(i));
}

// prepare the string and do the regex replace first
// then evaluate the EL on the result
String replacement = context.getProperty(REPLACEMENT_VALUE).getValue();
String replacement = context.getProperty(REPLACEMENT_VALUE).evaluateAttributeExpressions(flowFile, additionalAttrs, escapeBackRefDecorator).getValue();
replacement = escapeLiteralBackReferences(replacement, numCapturingGroups);
replacement = escapeExpressionDollarSigns(replacement);
replacement = wrapLiterals(replacement);
replacement = oneLine.replaceAll(searchRegex, replacement);
replacement = escapeForEvaluation(replacement);
String replacementFinal = normalizeReplacementString(replacement);

matcher.appendReplacement(sb, replacementFinal);
}

PropertyValue tempValue = context.newPropertyValue(replacement);
final String updatedValue = tempValue.evaluateAttributeExpressions(flowFile, additionalAttrs, null).getValue();
if (matches > 0) {
matcher.appendTail(sb);

final String updatedValue = sb.toString();
bw.write(updatedValue);
} else {
// No match. Just write out the line as it was.
Expand Down Expand Up @@ -678,77 +692,17 @@ public boolean isAllDataBufferedForEntireText() {
}
}

/**
* Wraps '$1' with the {@code literal} function for EL evaluation.
* @param possibleLiteral the {@code String} to evaluate.
* @return {@code String} with literals wrapped. If no literals or Expression Lanaguage present the passed string
* is returned.
*/
private static String wrapLiterals(String possibleLiteral) {
String replacementFinal = possibleLiteral;
if (!possibleLiteral.contains("${")) {
return possibleLiteral;
}

if (QUOTED_GROUP_REF_PATTERN.matcher(replacementFinal).find()) {
replacementFinal = replacementFinal.replaceAll("(\\$\\{\\s*?)('\\$\\d+?')(.*\\})", "$1literal($2)$3");
}

if (DOUBLE_QUOTED_GROUP_REF_PATTERN.matcher(replacementFinal).find()) {
replacementFinal = replacementFinal.replaceAll("(\\$\\{\\s*?)(\"\\$\\d+?\")(.*\\})", "$1literal($2)$3");
}

return replacementFinal;
}

/**
* If we have a '$' followed by anything other than a number, then escape
* it if it is not already escaped. E.g., '$d' becomes '\$d' so that it can be used as a literal in a
* it. E.g., '$d' becomes '\$d' so that it can be used as a literal in a
* regex.
*/
private static String escapeExpressionDollarSigns(String replacement) {

// are there expressions or group references
if (replacement.indexOf('$') == -1) {
return replacement;
private static String normalizeReplacementString(String replacement) {
String replacementFinal = replacement;
if (REPLACEMENT_NORMALIZATION_PATTERN.matcher(replacement).find()) {
replacementFinal = Matcher.quoteReplacement(replacement);
}
StringBuilder sb = new StringBuilder();
boolean lastWasEscape = false;
for (int i=0; i<replacement.length(); i++) {
char c = replacement.charAt(i);
if (c == '\\' ) {
lastWasEscape = true;
} else {
if ( c == '$') {
if (!lastWasEscape && !Character.isDigit(replacement.charAt(i+1))) {
sb.append('\\');
}
}
lastWasEscape = false;
}
sb.append(c);
}
return sb.toString();
}

/**
* Escapes a {@code String} containing literal('') EL values.
* @param contentString the {@code String}
* @return the escaped {@code String}. If no literal() is present, then the input {@code String} will be returned
*/
private static String escapeForEvaluation(String contentString) {
final Matcher matcher = LITERAL_QUOTED_PATTERN.matcher(contentString);
String returnString = contentString;
while(matcher.find()) {
for (int i = 1; i <= matcher.groupCount(); i ++) {
String replacement = matcher.group(i)
.replaceAll("\\n","\\\\n")
.replaceAll("\\r","\\\\r")
.replaceAll("\\t","\\\\t");
returnString = new StringBuilder(returnString).replace(matcher.start(i),matcher.end(i),replacement).toString();
}
}
return returnString;
return replacementFinal;
}

private interface ReplacementStrategyExecutor {
Expand Down
Loading

0 comments on commit 0274bd4

Please sign in to comment.