From f6fdeb0e63f3058eabf860bd893bea24f6649b4c Mon Sep 17 00:00:00 2001 From: Yufei Gu Date: Tue, 16 Nov 2021 12:01:24 -0800 Subject: [PATCH] Core: Optimize CharSeq equality check for path filtering (#3530) --- .../org/apache/iceberg/deletes/Deletes.java | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java index 41c7793c769f..6bb51c7fdead 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java +++ b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java @@ -21,7 +21,6 @@ import java.io.IOException; import java.io.UncheckedIOException; -import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.function.Function; @@ -37,7 +36,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Filter; import org.apache.iceberg.util.SortedMerge; @@ -252,7 +250,6 @@ public void close() { } private static class DataFileFilter extends Filter { - private static final Comparator CHARSEQ_COMPARATOR = Comparators.charSequences(); private final CharSequence dataLocation; DataFileFilter(CharSequence dataLocation) { @@ -261,7 +258,32 @@ private static class DataFileFilter extends Filter { @Override protected boolean shouldKeep(T posDelete) { - return CHARSEQ_COMPARATOR.compare(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete)) == 0; + return charSeqEquals(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete)); + } + + private boolean charSeqEquals(CharSequence s1, CharSequence s2) { + if (s1 == s2) { + return true; + } + + int count = s1.length(); + if (count != s2.length()) { + return false; + } + 
+ if (s1 instanceof String && s2 instanceof String && s1.hashCode() != s2.hashCode()) { + return false; + } + + // File paths inside a delete file normally share a long common prefix. For example, a typical + // path is like "s3://bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet". + // The uuid is where the paths start to differ, so comparing backward finds a mismatch sooner. + for (int i = count - 1; i >= 0; i--) { + if (s1.charAt(i) != s2.charAt(i)) { + return false; + } + } + return true; } } }