Skip to content

Commit

Permalink
API: Update expression sanitization for relative dates and times (apa…
Browse files Browse the repository at this point in the history
  • Loading branch information
rdblue authored Oct 21, 2022
1 parent c103b93 commit 8615675
Show file tree
Hide file tree
Showing 2 changed files with 361 additions and 38 deletions.
138 changes: 102 additions & 36 deletions api/src/main/java/org/apache/iceberg/expressions/ExpressionUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,13 @@
*/
package org.apache.iceberg.expressions;

import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand All @@ -33,11 +38,17 @@
public class ExpressionUtil {
private static final Function<Object, Integer> HASH_FUNC =
Transforms.bucket(Integer.MAX_VALUE).bind(Types.StringType.get());
private static final Pattern DATE = Pattern.compile("\\d\\d\\d\\d-\\d\\d-\\d\\d");
private static final Pattern TIME = Pattern.compile("\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?");
private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);
private static final long FIVE_MINUTES_IN_MICROS = TimeUnit.MINUTES.toMicros(5);
private static final long THREE_DAYS_IN_HOURS = TimeUnit.DAYS.toHours(3);
private static final long NINETY_DAYS_IN_HOURS = TimeUnit.DAYS.toHours(90);
private static final Pattern DATE = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
private static final Pattern TIME = Pattern.compile("\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?");
private static final Pattern TIMESTAMP =
Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?");
private static final Pattern TIMESTAMPTZ =
Pattern.compile(
"\\d\\d\\d\\d-\\d\\d-\\d\\dT\\d\\d:\\d\\d(:\\d\\d(.\\d{1,6})?)?([-+]\\d\\d:\\d\\d)?");
"\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}(:\\d{2}(.\\d{1,6})?)?([-+]\\d{2}:\\d{2}|Z)");
static final int LONG_IN_PREDICATE_ABBREVIATION_THRESHOLD = 10;
private static final int LONG_IN_PREDICATE_ABBREVIATION_MIN_GAIN = 5;

Expand All @@ -54,7 +65,7 @@ private ExpressionUtil() {}
* @return a sanitized Expression
*/
public static Expression sanitize(Expression expr) {
return ExpressionVisitors.visit(expr, ExpressionSanitizer.INSTANCE);
return ExpressionVisitors.visit(expr, new ExpressionSanitizer());
}

/**
Expand All @@ -68,7 +79,7 @@ public static Expression sanitize(Expression expr) {
* @return a sanitized expression string
*/
public static String toSanitizedString(Expression expr) {
return ExpressionVisitors.visit(expr, StringSanitizer.INSTANCE);
return ExpressionVisitors.visit(expr, new StringSanitizer());
}

/**
Expand Down Expand Up @@ -111,7 +122,15 @@ public static boolean selectsPartitions(

private static class ExpressionSanitizer
extends ExpressionVisitors.ExpressionVisitor<Expression> {
private static final ExpressionSanitizer INSTANCE = new ExpressionSanitizer();
private final long now;
private final int today;

/**
 * Captures the current time once at construction so every literal sanitized by this visitor is
 * compared against the same reference point.
 */
private ExpressionSanitizer() {
  final long currentMillis = System.currentTimeMillis();
  // reference timestamp, converted from milliseconds to microseconds
  this.now = currentMillis * 1000;
  // reference date: whole days elapsed between EPOCH and the current UTC time
  final OffsetDateTime utcNow = Instant.ofEpochMilli(currentMillis).atOffset(ZoneOffset.UTC);
  this.today = (int) EPOCH.until(utcNow, ChronoUnit.DAYS);
}

@Override
public Expression alwaysTrue() {
Expand Down Expand Up @@ -161,11 +180,12 @@ public <T> Expression predicate(UnboundPredicate<T> pred) {
case NOT_EQ:
case STARTS_WITH:
case NOT_STARTS_WITH:
return new UnboundPredicate<>(pred.op(), pred.term(), (T) sanitize(pred.literal()));
return new UnboundPredicate<>(
pred.op(), pred.term(), (T) sanitize(pred.literal(), now, today));
case IN:
case NOT_IN:
Iterable<String> iter =
() -> pred.literals().stream().map(ExpressionUtil::sanitize).iterator();
() -> pred.literals().stream().map(lit -> sanitize(lit, now, today)).iterator();
return new UnboundPredicate<>(pred.op(), pred.term(), (Iterable<T>) iter);
default:
throw new UnsupportedOperationException(
Expand All @@ -175,7 +195,15 @@ public <T> Expression predicate(UnboundPredicate<T> pred) {
}

private static class StringSanitizer extends ExpressionVisitors.ExpressionVisitor<String> {
private static final StringSanitizer INSTANCE = new StringSanitizer();
private final long nowMicros;
private final int today;

/**
 * Captures the current time once at construction so every literal sanitized by this visitor is
 * compared against the same reference point.
 */
private StringSanitizer() {
  final long currentMillis = System.currentTimeMillis();
  // reference timestamp, converted from milliseconds to microseconds
  this.nowMicros = currentMillis * 1000;
  // reference date: whole days elapsed between EPOCH and the current UTC time
  final OffsetDateTime utcNow = Instant.ofEpochMilli(currentMillis).atOffset(ZoneOffset.UTC);
  this.today = (int) EPOCH.until(utcNow, ChronoUnit.DAYS);
}

@Override
public String alwaysTrue() {
Expand Down Expand Up @@ -230,23 +258,23 @@ public <T> String predicate(UnboundPredicate<T> pred) {
case NOT_NAN:
return "not_nan(" + term + ")";
case LT:
return term + " < " + sanitize(pred.literal());
return term + " < " + sanitize(pred.literal(), nowMicros, today);
case LT_EQ:
return term + " <= " + sanitize(pred.literal());
return term + " <= " + sanitize(pred.literal(), nowMicros, today);
case GT:
return term + " > " + sanitize(pred.literal());
return term + " > " + sanitize(pred.literal(), nowMicros, today);
case GT_EQ:
return term + " >= " + sanitize(pred.literal());
return term + " >= " + sanitize(pred.literal(), nowMicros, today);
case EQ:
return term + " = " + sanitize(pred.literal());
return term + " = " + sanitize(pred.literal(), nowMicros, today);
case NOT_EQ:
return term + " != " + sanitize(pred.literal());
return term + " != " + sanitize(pred.literal(), nowMicros, today);
case IN:
return term
+ " IN "
+ abbreviateValues(
pred.literals().stream()
.map(ExpressionUtil::sanitize)
.map(lit -> sanitize(lit, nowMicros, today))
.collect(Collectors.toList()))
.stream()
.collect(Collectors.joining(", ", "(", ")"));
Expand All @@ -255,14 +283,14 @@ public <T> String predicate(UnboundPredicate<T> pred) {
+ " NOT IN "
+ abbreviateValues(
pred.literals().stream()
.map(ExpressionUtil::sanitize)
.map(lit -> sanitize(lit, nowMicros, today))
.collect(Collectors.toList()))
.stream()
.collect(Collectors.joining(", ", "(", ")"));
case STARTS_WITH:
return term + " STARTS WITH " + sanitize(pred.literal());
return term + " STARTS WITH " + sanitize(pred.literal(), nowMicros, today);
case NOT_STARTS_WITH:
return term + " NOT STARTS WITH " + sanitize(pred.literal());
return term + " NOT STARTS WITH " + sanitize(pred.literal(), nowMicros, today);
default:
throw new UnsupportedOperationException(
"Cannot sanitize unsupported predicate type: " + pred.op());
Expand All @@ -279,32 +307,23 @@ private static <T> List<String> abbreviateValues(List<String> sanitizedValues) {
abbreviatedList.addAll(distinctValues);
abbreviatedList.add(
String.format(
"... (%d values hidden, %d in total) ...",
"... (%d values hidden, %d in total)",
sanitizedValues.size() - distinctValues.size(), sanitizedValues.size()));
return abbreviatedList;
}
}
return sanitizedValues;
}

private static String sanitize(Literal<?> literal) {
private static String sanitize(Literal<?> literal, long now, int today) {
if (literal instanceof Literals.StringLiteral) {
CharSequence value = ((Literals.StringLiteral) literal).value();
if (DATE.matcher(value).matches()) {
return "(date)";
} else if (TIME.matcher(value).matches()) {
return "(time)";
} else if (TIMESTAMP.matcher(value).matches()) {
return "(timestamp)";
} else {
return sanitizeString(value);
}
return sanitizeString(((Literals.StringLiteral) literal).value(), now, today);
} else if (literal instanceof Literals.DateLiteral) {
return "(date)";
return sanitizeDate(((Literals.DateLiteral) literal).value(), today);
} else if (literal instanceof Literals.TimestampLiteral) {
return sanitizeTimestamp(((Literals.TimestampLiteral) literal).value(), now);
} else if (literal instanceof Literals.TimeLiteral) {
return "(time)";
} else if (literal instanceof Literals.TimestampLiteral) {
return "(timestamp)";
} else if (literal instanceof Literals.IntegerLiteral) {
return sanitizeNumber(((Literals.IntegerLiteral) literal).value(), "int");
} else if (literal instanceof Literals.LongLiteral) {
Expand All @@ -315,8 +334,38 @@ private static String sanitize(Literal<?> literal) {
return sanitizeNumber(((Literals.DoubleLiteral) literal).value(), "float");
} else {
// for uuid, decimal, fixed, and binary, match the string result
return sanitizeString(literal.value().toString());
return sanitizeSimpleString(literal.value().toString());
}
}

/**
 * Replaces a date value with a description of its distance from a reference date.
 *
 * <p>Both arguments are day counts measured from the same origin. Dates less than 90 days from
 * the reference are described relatively; anything further away is reported only as a date.
 *
 * @param days the date to sanitize, as a day count
 * @param today the reference date, as a day count
 * @return {@code (date-today)}, {@code (date-N-days-ago)}, {@code (date-N-days-from-now)}, or
 *     {@code (date)}
 */
private static String sanitizeDate(int days, int today) {
  // widen before subtracting: in int arithmetic today - days can wrap to Integer.MIN_VALUE, whose
  // Math.abs is still negative and would be mistaken for a distance of less than 90 days
  long diff = Math.abs((long) today - (long) days);
  if (diff == 0) {
    return "(date-today)";
  } else if (diff < 90) {
    String isPast = today > days ? "ago" : "from-now";
    return "(date-" + diff + "-days-" + isPast + ")";
  }

  return "(date)";
}

/**
 * Replaces a timestamp value with a description of its distance from a reference time.
 *
 * <p>Both arguments are in microseconds. Values within five minutes of the reference are
 * reported as "about now", values up to three days away are reported in hours, values less than
 * 90 days away are reported in days, and anything further is reported only as a timestamp.
 *
 * @param micros the timestamp to sanitize, in microseconds
 * @param now the reference timestamp, in microseconds
 * @return {@code (timestamp-about-now)}, {@code (timestamp-N-hours-ago|from-now)}, {@code
 *     (timestamp-N-days-ago|from-now)}, or {@code (timestamp)}
 */
private static String sanitizeTimestamp(long micros, long now) {
  // max - min is the exact distance when read as an unsigned value; a negative signed result
  // means the true distance is at least 2^63 micros, which is far outside every relative bucket.
  // Math.abs(now - micros) could instead yield Long.MIN_VALUE and be reported as "about now".
  long diff = Math.max(now, micros) - Math.min(now, micros);
  if (diff < 0) {
    return "(timestamp)";
  }

  if (diff < FIVE_MINUTES_IN_MICROS) {
    return "(timestamp-about-now)";
  }

  String isPast = now > micros ? "ago" : "from-now";
  long hours = TimeUnit.MICROSECONDS.toHours(diff);
  if (hours <= THREE_DAYS_IN_HOURS) {
    return "(timestamp-" + hours + "-hours-" + isPast + ")";
  } else if (hours < NINETY_DAYS_IN_HOURS) {
    long days = hours / 24;
    return "(timestamp-" + days + "-days-" + isPast + ")";
  }

  return "(timestamp)";
}

private static String sanitizeNumber(Number value, String type) {
Expand All @@ -326,7 +375,24 @@ private static String sanitizeNumber(Number value, String type) {
return "(" + numDigits + "-digit-" + type + ")";
}

private static String sanitizeString(CharSequence value) {
/**
 * Sanitizes a string value, describing it as a relative date or timestamp when it looks like one
 * and hashing it otherwise.
 *
 * <p>The patterns only check the shape of the value, so a string can match and still fail to
 * convert (for example {@code 9999-99-99}). Such values are treated as ordinary strings rather
 * than letting the conversion failure abort sanitization.
 *
 * @param value the string to sanitize
 * @param now the reference timestamp passed to {@code sanitizeTimestamp}
 * @param today the reference date passed to {@code sanitizeDate}
 * @return a sanitized placeholder for the value
 */
private static String sanitizeString(CharSequence value, long now, int today) {
  try {
    if (DATE.matcher(value).matches()) {
      Literal<Integer> date = Literal.of(value).to(Types.DateType.get());
      return sanitizeDate(date.value(), today);
    } else if (TIMESTAMP.matcher(value).matches()) {
      Literal<Long> ts = Literal.of(value).to(Types.TimestampType.withoutZone());
      return sanitizeTimestamp(ts.value(), now);
    } else if (TIMESTAMPTZ.matcher(value).matches()) {
      Literal<Long> ts = Literal.of(value).to(Types.TimestampType.withZone());
      return sanitizeTimestamp(ts.value(), now);
    } else if (TIME.matcher(value).matches()) {
      return "(time)";
    } else {
      return sanitizeSimpleString(value);
    }
  } catch (RuntimeException ignored) {
    // the value matched a date/time shape but could not be converted; sanitizing must not fail
    // because of the content of a literal, so fall back to the hashed form
    return sanitizeSimpleString(value);
  }
}

/**
 * Replaces a string value with a placeholder built from its hash.
 *
 * @param value the string to sanitize
 * @return the hash of the value, formatted as zero-padded hex inside {@code (hash-...)}
 */
private static String sanitizeSimpleString(CharSequence value) {
  Integer hashed = HASH_FUNC.apply(value);
  return String.format("(hash-%08x)", hashed);
}
Expand Down
Loading

0 comments on commit 8615675

Please sign in to comment.