Core: Optimize CharSeq equality check for path filtering (apache#3530)

Yanam · Nov 16, 2021 · f6fdeb0 · f6fdeb0
1 parent 36e261b
commit f6fdeb0
Showing 1 changed file with 26 additions and 4 deletions.
diff --git a/core/src/main/java/org/apache/iceberg/deletes/Deletes.java b/core/src/main/java/org/apache/iceberg/deletes/Deletes.java
@@ -21,7 +21,6 @@
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
-import java.util.Comparator;
 import java.util.List;
 import java.util.Set;
 import java.util.function.Function;
@@ -37,7 +36,6 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
-import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.Filter;
 import org.apache.iceberg.util.SortedMerge;
@@ -252,7 +250,6 @@ public void close() {
   }
 
   private static class DataFileFilter<T extends StructLike> extends Filter<T> {
-    private static final Comparator<CharSequence> CHARSEQ_COMPARATOR = Comparators.charSequences();
     private final CharSequence dataLocation;
 
     DataFileFilter(CharSequence dataLocation) {
@@ -261,7 +258,32 @@ private static class DataFileFilter<T extends StructLike> extends Filter<T> {
 
     @Override
     protected boolean shouldKeep(T posDelete) {
-      return CHARSEQ_COMPARATOR.compare(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete)) == 0;
+      return charSeqEquals(dataLocation, (CharSequence) FILENAME_ACCESSOR.get(posDelete));
+    }
+
+    private boolean charSeqEquals(CharSequence s1, CharSequence s2) {
+      if (s1 == s2) { // same reference: equal without inspecting any chars
+        return true;
+      }
+
+      int count = s1.length();
+      if (count != s2.length()) { // sequences of different length can never be equal
+        return false;
+      }
+
+      if (s1 instanceof String && s2 instanceof String && s1.hashCode() != s2.hashCode()) {
+        return false; // equal Strings always have equal hash codes, so a mismatch proves inequality
+      }
+
+      // Paths in a delete file usually share a long common prefix and differ near the end. For example, a typical
+      // path looks like "s3://bucket/db/table/data/partition/00000-0-[uuid]-00001.parquet", where the uuid is the
+      // first point of difference, so scanning backward from the last char finds a mismatch sooner.
+      for (int i = count - 1; i >= 0; i--) {
+        if (s1.charAt(i) != s2.charAt(i)) {
+          return false;
+        }
+      }
+      return true; // same length and every char matched
     }
   }
 }