From 86156757653112b1a89d7e7f08903ef746fd918b Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 21 Oct 2022 11:13:49 -0700 Subject: [PATCH] API: Update expression sanitization for relative dates and times (#5944) --- .../iceberg/expressions/ExpressionUtil.java | 138 ++++++--- .../expressions/TestExpressionUtil.java | 261 +++++++++++++++++- 2 files changed, 361 insertions(+), 38 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java index 5c0993f4d04b..9d94be163586 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java +++ b/api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java @@ -18,8 +18,13 @@ */ package org.apache.iceberg.expressions; +import java.time.Instant; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -33,11 +38,17 @@ public class ExpressionUtil { private static final Function HASH_FUNC = Transforms.bucket(Integer.MAX_VALUE).bind(Types.StringType.get()); - private static final Pattern DATE = Pattern.compile("\\d\\d\\d\\d-\\d\\d-\\d\\d"); - private static final Pattern TIME = Pattern.compile("\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?"); + private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC); + private static final long FIVE_MINUTES_IN_MICROS = TimeUnit.MINUTES.toMicros(5); + private static final long THREE_DAYS_IN_HOURS = TimeUnit.DAYS.toHours(3); + private static final long NINETY_DAYS_IN_HOURS = TimeUnit.DAYS.toHours(90); + private static final Pattern DATE = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); + private static final Pattern TIME = Pattern.compile("\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?"); private static final Pattern TIMESTAMP = + Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?"); + private static final Pattern TIMESTAMPTZ = Pattern.compile( - "\\d\\d\\d\\d-\\d\\d-\\d\\dT\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?([-+]\\d\\d:\\d\\d)?"); + "\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?([-+]\\d{2}:\\d{2}|Z)"); static final int LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD = 10; private static final int LONG_IN_PREDICATE_ABBREVIATION_MIN_GAIN = 5; @@ -54,7 +65,7 @@ private ExpressionUtil() {} * @return a sanitized Expression */ public static Expression sanitize(Expression expr) { - return ExpressionVisitors.visit(expr, ExpressionSanitizer.INSTANCE); + return ExpressionVisitors.visit(expr, new ExpressionSanitizer()); } /** @@ -68,7 +79,7 @@ public static Expression sanitize(Expression expr) { * @return a sanitized expression string */ public static String toSanitizedString(Expression expr) { - return ExpressionVisitors.visit(expr, StringSanitizer.INSTANCE); + return ExpressionVisitors.visit(expr, new StringSanitizer()); } /** @@ -111,7 +122,15 @@ public static boolean selectsPartitions( private static class ExpressionSanitizer extends ExpressionVisitors.ExpressionVisitor { - private static final ExpressionSanitizer INSTANCE = new ExpressionSanitizer(); + private final long now; + private final int today; + + private ExpressionSanitizer() { + long nowMillis = System.currentTimeMillis(); + OffsetDateTime nowDateTime = Instant.ofEpochMilli(nowMillis).atOffset(ZoneOffset.UTC); + this.now = nowMillis * 1000; + this.today = (int) ChronoUnit.DAYS.between(EPOCH, nowDateTime); + } @Override public Expression alwaysTrue() { @@ -161,11 +180,12 @@ public Expression predicate(UnboundPredicate pred) { case NOT_EQ: case STARTS_WITH: case NOT_STARTS_WITH: - return new UnboundPredicate<>(pred.op(), pred.term(), (T) sanitize(pred.literal())); + return new UnboundPredicate<>( + pred.op(), pred.term(), (T) sanitize(pred.literal(), now, today)); case IN: case NOT_IN: Iterable iter = - () -> pred.literals().stream().map(ExpressionUtil::sanitize).iterator(); + () -> pred.literals().stream().map(lit -> sanitize(lit, now, today)).iterator(); return new UnboundPredicate<>(pred.op(), pred.term(), (Iterable) iter); default: throw new UnsupportedOperationException( @@ -175,7 +195,15 @@ public Expression predicate(UnboundPredicate pred) { } private static class StringSanitizer extends ExpressionVisitors.ExpressionVisitor { - private static final StringSanitizer INSTANCE = new StringSanitizer(); + private final long nowMicros; + private final int today; + + private StringSanitizer() { + long nowMillis = System.currentTimeMillis(); + OffsetDateTime nowDateTime = Instant.ofEpochMilli(nowMillis).atOffset(ZoneOffset.UTC); + this.nowMicros = nowMillis * 1000; + this.today = (int) ChronoUnit.DAYS.between(EPOCH, nowDateTime); + } @Override public String alwaysTrue() { @@ -230,23 +258,23 @@ public String predicate(UnboundPredicate pred) { case NOT_NAN: return "not_nan(" + term + ")"; case LT: - return term + " < " + sanitize(pred.literal()); + return term + " < " + sanitize(pred.literal(), nowMicros, today); case LT_EQ: - return term + " <= " + sanitize(pred.literal()); + return term + " <= " + sanitize(pred.literal(), nowMicros, today); case GT: - return term + " > " + sanitize(pred.literal()); + return term + " > " + sanitize(pred.literal(), nowMicros, today); case GT_EQ: - return term + " >= " + sanitize(pred.literal()); + return term + " >= " + sanitize(pred.literal(), nowMicros, today); case EQ: - return term + " = " + sanitize(pred.literal()); + return term + " = " + sanitize(pred.literal(), nowMicros, today); case NOT_EQ: - return term + " != " + sanitize(pred.literal()); + return term + " != " + sanitize(pred.literal(), nowMicros, today); case IN: return term + " IN " + abbreviateValues( pred.literals().stream() - .map(ExpressionUtil::sanitize) + .map(lit -> sanitize(lit, nowMicros, today)) .collect(Collectors.toList())) .stream() .collect(Collectors.joining(", ", "(", ")")); @@ -255,14 +283,14 @@ public String predicate(UnboundPredicate pred) { + " NOT IN " + abbreviateValues( pred.literals().stream() - .map(ExpressionUtil::sanitize) + .map(lit -> sanitize(lit, nowMicros, today)) .collect(Collectors.toList())) .stream() .collect(Collectors.joining(", ", "(", ")")); case STARTS_WITH: - return term + " STARTS WITH " + sanitize(pred.literal()); + return term + " STARTS WITH " + sanitize(pred.literal(), nowMicros, today); case NOT_STARTS_WITH: - return term + " NOT STARTS WITH " + sanitize(pred.literal()); + return term + " NOT STARTS WITH " + sanitize(pred.literal(), nowMicros, today); default: throw new UnsupportedOperationException( "Cannot sanitize unsupported predicate type: " + pred.op()); @@ -279,7 +307,7 @@ private static List abbreviateValues(List sanitizedValues) { abbreviatedList.addAll(distinctValues); abbreviatedList.add( String.format( - "... (%d values hidden, %d in total) ...", + "... (%d values hidden, %d in total)", sanitizedValues.size() - distinctValues.size(), sanitizedValues.size())); return abbreviatedList; } @@ -287,24 +315,15 @@ private static List abbreviateValues(List sanitizedValues) { return sanitizedValues; } - private static String sanitize(Literal literal) { + private static String sanitize(Literal literal, long now, int today) { if (literal instanceof Literals.StringLiteral) { - CharSequence value = ((Literals.StringLiteral) literal).value(); - if (DATE.matcher(value).matches()) { - return "(date)"; - } else if (TIME.matcher(value).matches()) { - return "(time)"; - } else if (TIMESTAMP.matcher(value).matches()) { - return "(timestamp)"; - } else { - return sanitizeString(value); - } + return sanitizeString(((Literals.StringLiteral) literal).value(), now, today); } else if (literal instanceof Literals.DateLiteral) { - return "(date)"; + return sanitizeDate(((Literals.DateLiteral) literal).value(), today); + } else if (literal instanceof Literals.TimestampLiteral) { + return sanitizeTimestamp(((Literals.TimestampLiteral) literal).value(), now); } else if (literal instanceof Literals.TimeLiteral) { return "(time)"; - } else if (literal instanceof Literals.TimestampLiteral) { - return "(timestamp)"; } else if (literal instanceof Literals.IntegerLiteral) { return sanitizeNumber(((Literals.IntegerLiteral) literal).value(), "int"); } else if (literal instanceof Literals.LongLiteral) { @@ -315,8 +334,38 @@ private static String sanitize(Literal literal) { return sanitizeNumber(((Literals.DoubleLiteral) literal).value(), "float"); } else { // for uuid, decimal, fixed, and binary, match the string result - return sanitizeString(literal.value().toString()); + return sanitizeSimpleString(literal.value().toString()); + } + } + + private static String sanitizeDate(int days, int today) { + String isPast = today > days ? "ago" : "from-now"; + int diff = Math.abs(today - days); + if (diff == 0) { + return "(date-today)"; + } else if (diff < 90) { + return "(date-" + diff + "-days-" + isPast + ")"; } + + return "(date)"; + } + + private static String sanitizeTimestamp(long micros, long now) { + String isPast = now > micros ? "ago" : "from-now"; + long diff = Math.abs(now - micros); + if (diff < FIVE_MINUTES_IN_MICROS) { + return "(timestamp-about-now)"; + } + + long hours = TimeUnit.MICROSECONDS.toHours(diff); + if (hours <= THREE_DAYS_IN_HOURS) { + return "(timestamp-" + hours + "-hours-" + isPast + ")"; + } else if (hours < NINETY_DAYS_IN_HOURS) { + long days = hours / 24; + return "(timestamp-" + days + "-days-" + isPast + ")"; + } + + return "(timestamp)"; } private static String sanitizeNumber(Number value, String type) { @@ -326,7 +375,24 @@ private static String sanitizeNumber(Number value, String type) { return "(" + numDigits + "-digit-" + type + ")"; } - private static String sanitizeString(CharSequence value) { + private static String sanitizeString(CharSequence value, long now, int today) { + if (DATE.matcher(value).matches()) { + Literal date = Literal.of(value).to(Types.DateType.get()); + return sanitizeDate(date.value(), today); + } else if (TIMESTAMP.matcher(value).matches()) { + Literal ts = Literal.of(value).to(Types.TimestampType.withoutZone()); + return sanitizeTimestamp(ts.value(), now); + } else if (TIMESTAMPTZ.matcher(value).matches()) { + Literal ts = Literal.of(value).to(Types.TimestampType.withZone()); + return sanitizeTimestamp(ts.value(), now); + } else if (TIME.matcher(value).matches()) { + return "(time)"; + } else { + return sanitizeSimpleString(value); + } + } + + private static String sanitizeSimpleString(CharSequence value) { // hash the value and return the hash as hex return String.format("(hash-%08x)", HASH_FUNC.apply(value)); } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java index a60de0a2a182..5129d46871de 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java @@ -18,6 +18,9 @@ */ package org.apache.iceberg.expressions; +import java.time.LocalDate; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -83,7 +86,7 @@ public void testSanitizeLongIn() { Assert.assertEquals( "Sanitized string should be abbreviated", - "test IN ((2-digit-int), (3-digit-int), ... (8 values hidden, 10 in total) ...)", + "test IN ((2-digit-int), (3-digit-int), ... (8 values hidden, 10 in total))", ExpressionUtil.toSanitizedString(Expressions.in("test", tooLongRange))); // The sanitization resulting in an expression tree does not abbreviate @@ -140,7 +143,7 @@ public void testSanitizeLongNotIn() { Assert.assertEquals( "Sanitized string should be abbreviated", - "test NOT IN ((2-digit-int), (3-digit-int), ... (8 values hidden, 10 in total) ...)", + "test NOT IN ((2-digit-int), (3-digit-int), ... (8 values hidden, 10 in total))", ExpressionUtil.toSanitizedString(Expressions.notIn("test", tooLongRange))); // The sanitization resulting in an expression tree does not abbreviate @@ -339,6 +342,260 @@ public void testSanitizeTimestamp() { } } + @Test + public void testSanitizeTimestampAboutNow() { + // this string is the current UTC time, without a zone offset + String nowLocal = + OffsetDateTime.now().atZoneSameInstant(ZoneOffset.UTC).toLocalDateTime().toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-about-now)"), + ExpressionUtil.sanitize(Expressions.equal("test", nowLocal))); + + assertEquals( + Expressions.equal("test", "(timestamp-about-now)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(nowLocal).to(Types.TimestampType.withoutZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-about-now)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", nowLocal))); + } + + @Test + public void testSanitizeTimestampPast() { + String ninetyMinutesAgoLocal = + OffsetDateTime.now() + .minusMinutes(90) + .atZoneSameInstant(ZoneOffset.UTC) + .toLocalDateTime() + .toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-ago)"), + ExpressionUtil.sanitize(Expressions.equal("test", ninetyMinutesAgoLocal))); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-ago)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(ninetyMinutesAgoLocal).to(Types.TimestampType.withoutZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-1-hours-ago)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", ninetyMinutesAgoLocal))); + } + + @Test + public void testSanitizeTimestampLastWeek() { + String lastWeekLocal = + OffsetDateTime.now() + .minusHours(180) + .atZoneSameInstant(ZoneOffset.UTC) + .toLocalDateTime() + .toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-7-days-ago)"), + ExpressionUtil.sanitize(Expressions.equal("test", lastWeekLocal))); + + assertEquals( + Expressions.equal("test", "(timestamp-7-days-ago)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(lastWeekLocal).to(Types.TimestampType.withoutZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-7-days-ago)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", lastWeekLocal))); + } + + @Test + public void testSanitizeTimestampFuture() { + String ninetyMinutesFromNowLocal = + OffsetDateTime.now() + .plusMinutes(90) + .atZoneSameInstant(ZoneOffset.UTC) + .toLocalDateTime() + .toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-from-now)"), + ExpressionUtil.sanitize(Expressions.equal("test", ninetyMinutesFromNowLocal))); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-from-now)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(ninetyMinutesFromNowLocal).to(Types.TimestampType.withoutZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-1-hours-from-now)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", ninetyMinutesFromNowLocal))); + } + + @Test + public void testSanitizeTimestamptzAboutNow() { + // this string is the current time with the local zone offset + String nowUtc = OffsetDateTime.now().toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-about-now)"), + ExpressionUtil.sanitize(Expressions.equal("test", nowUtc))); + + assertEquals( + Expressions.equal("test", "(timestamp-about-now)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(nowUtc).to(Types.TimestampType.withZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-about-now)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", nowUtc))); + } + + @Test + public void testSanitizeTimestamptzPast() { + String ninetyMinutesAgoUtc = OffsetDateTime.now().minusMinutes(90).toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-ago)"), + ExpressionUtil.sanitize(Expressions.equal("test", ninetyMinutesAgoUtc))); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-ago)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(ninetyMinutesAgoUtc).to(Types.TimestampType.withZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-1-hours-ago)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", ninetyMinutesAgoUtc))); + } + + @Test + public void testSanitizeTimestamptzLastWeek() { + String lastWeekUtc = OffsetDateTime.now().minusHours(180).toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-7-days-ago)"), + ExpressionUtil.sanitize(Expressions.equal("test", lastWeekUtc))); + + assertEquals( + Expressions.equal("test", "(timestamp-7-days-ago)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(lastWeekUtc).to(Types.TimestampType.withZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-7-days-ago)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", lastWeekUtc))); + } + + @Test + public void testSanitizeTimestamptzFuture() { + String ninetyMinutesFromNowUtc = OffsetDateTime.now().plusMinutes(90).toString(); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-from-now)"), + ExpressionUtil.sanitize(Expressions.equal("test", ninetyMinutesFromNowUtc))); + + assertEquals( + Expressions.equal("test", "(timestamp-1-hours-from-now)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, + "test", + Literal.of(ninetyMinutesFromNowUtc).to(Types.TimestampType.withZone())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (timestamp-1-hours-from-now)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", ninetyMinutesFromNowUtc))); + } + + @Test + public void testSanitizeDateToday() { + String today = LocalDate.now().toString(); + + assertEquals( + Expressions.equal("test", "(date-today)"), + ExpressionUtil.sanitize(Expressions.equal("test", today))); + + assertEquals( + Expressions.equal("test", "(date-today)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, "test", Literal.of(today).to(Types.DateType.get())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (date-today)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", today))); + } + + @Test + public void testSanitizeDateLastWeek() { + String lastWeek = LocalDate.now().minusWeeks(1).toString(); + + assertEquals( + Expressions.equal("test", "(date-7-days-ago)"), + ExpressionUtil.sanitize(Expressions.equal("test", lastWeek))); + + assertEquals( + Expressions.equal("test", "(date-7-days-ago)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, "test", Literal.of(lastWeek).to(Types.DateType.get())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (date-7-days-ago)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", lastWeek))); + } + + @Test + public void testSanitizeDateNextWeek() { + String nextWeek = LocalDate.now().plusWeeks(1).toString(); + + assertEquals( + Expressions.equal("test", "(date-7-days-from-now)"), + ExpressionUtil.sanitize(Expressions.equal("test", nextWeek))); + + assertEquals( + Expressions.equal("test", "(date-7-days-from-now)"), + ExpressionUtil.sanitize( + Expressions.predicate( + Expression.Operation.EQ, "test", Literal.of(nextWeek).to(Types.DateType.get())))); + + Assert.assertEquals( + "Sanitized string should be identical except for descriptive literal", + "test = (date-7-days-from-now)", + ExpressionUtil.toSanitizedString(Expressions.equal("test", nextWeek))); + } + @Test public void testIdenticalExpressionIsEquivalent() { Expression[] exprs =