Skip to content

Commit

Permalink
OAK-10589 - Improve regex path filtering to also handle cases where e…
Browse files Browse the repository at this point in the history
…xcludedPaths are defined (apache#1258)
  • Loading branch information
nfsantos authored Jan 8, 2024
1 parent f62ae95 commit 71973a9
Show file tree
Hide file tree
Showing 4 changed files with 490 additions and 109 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;

import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.spi.filter.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.TreeSet;

public class MongoRegexPathFilterFactory {

public static class MongoFilterPaths {
// Special value that means "download everything". This is used when regex path filtering is disabled or when
// the path filters would require downloading everything. When this is returned, it is preferable to do not
// use any filter in the Mongo query, even though using the values in this object as filters would also result
// in downloading everything
public static final MongoFilterPaths DOWNLOAD_ALL = new MongoFilterPaths(List.of("/"), List.of());

final List<String> included;
final List<String> excluded;

public MongoFilterPaths(List<String> included, List<String> excluded) {
this.included = included;
this.excluded = excluded;
}

@Override
public String toString() {
return "MongoFilterPaths{" +
"included=" + included +
", excluded=" + excluded +
'}';
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
MongoFilterPaths that = (MongoFilterPaths) o;
return Objects.equals(included, that.included) && Objects.equals(excluded, that.excluded);
}

@Override
public int hashCode() {
return Objects.hash(included, excluded);
}
}

private final static Logger LOG = LoggerFactory.getLogger(MongoRegexPathFilterFactory.class);

private final int maxNumberOfPaths;

public MongoRegexPathFilterFactory(int maxNumberOfPaths) {
this.maxNumberOfPaths = maxNumberOfPaths;
}

public MongoFilterPaths buildMongoFilter(List<PathFilter> pathFilters) {
if (pathFilters == null || pathFilters.isEmpty()) {
return MongoFilterPaths.DOWNLOAD_ALL;
}
// Merge and deduplicate the included and excluded paths of all filters.
// The included paths will also be further de-deuplicated, by removing paths that are children of other path in
// the list, that is, by keeping only the top level paths. To make it easy to de-duplicate, we sort the paths
// alphabetically.
// The list of excluded paths does not need to be sorted, but we do it anyway for consistency when logging the
// list of paths.
TreeSet<String> sortedCandidateIncludedPaths = new TreeSet<>();
TreeSet<String> candidateExcludedPaths = new TreeSet<>();
for (PathFilter pathFilter : pathFilters) {
sortedCandidateIncludedPaths.addAll(pathFilter.getIncludedPaths());
candidateExcludedPaths.addAll(pathFilter.getExcludedPaths());
}
// Keep only unique included paths. That is, if paths "/a/b" and "/a/b/c" are both in the list, keep only "/a/b"
List<String> finalIncludedPathsRoots = new ArrayList<>();
LOG.debug("Candidate included paths: {}, candidate excluded paths: {}", sortedCandidateIncludedPaths, candidateExcludedPaths);
// The quadratic algorithm below could easily be replaced by linear time algorithm, but it's not worth the added
// complexity because the list of paths should never be more than a few tens
// Idea for linear time algorithm: after adding a path to the includedPathsRoots, advance in the list of candidates
// skipping all paths that are children of the path just added. Since the list is sorted alphabetically, the children
// of a path will be next to it in the list
for (String candidateIncludedPath : sortedCandidateIncludedPaths) {
if (finalIncludedPathsRoots.stream().noneMatch(ancestor -> PathUtils.isAncestor(ancestor, candidateIncludedPath))) {
finalIncludedPathsRoots.add(candidateIncludedPath);
}
}

List<String> finalExcludedPaths = new ArrayList<>();
// Keep an excluded path only if it is not needed by any filter.
for (String candidateExcludedPath : candidateExcludedPaths) {
boolean notNeeded = pathFilters.stream().allMatch(pathFilter -> pathFilter.filter(candidateExcludedPath) == PathFilter.Result.EXCLUDE);
if (notNeeded) {
finalExcludedPaths.add(candidateExcludedPath);
}
}

// Do not filter if there are too many includedPaths. The official Mongo documentation recommends
// not using more than tens of values in the $in filter to avoid performance degradation.
// https://www.mongodb.com/docs/manual/reference/operator/query/in/
if (finalIncludedPathsRoots.size() > maxNumberOfPaths) {
LOG.info("Number of includedPaths ({}) exceed the maximum allowed ({}), downloading everything",
finalIncludedPathsRoots.size(), maxNumberOfPaths);
return MongoFilterPaths.DOWNLOAD_ALL;
}

// Also check the number of excluded paths and disable filtering by them if there are too many.
// We do not need to revert to download the full repository in the case where there is a reasonable number of
// included paths but too many excluded paths. The excluded paths are always children of the included paths, so
// if we do not filter on excludedPaths, we simply download everything inside the included paths. This might be
// much less than the full repository.
if (finalExcludedPaths.size() > maxNumberOfPaths) {
LOG.info("Number of excludedPaths ({}) exceed the maximum allowed ({}), disabling filtering of excludedPaths",
finalExcludedPaths.size(), maxNumberOfPaths);
finalExcludedPaths = List.of();
}

if (finalExcludedPaths.isEmpty()) {
// Only included paths
if (finalIncludedPathsRoots.contains("/")) {
return MongoFilterPaths.DOWNLOAD_ALL;
} else {
return new MongoFilterPaths(finalIncludedPathsRoots, List.of());
}
} else {
return new MongoFilterPaths(finalIncludedPathsRoots, finalExcludedPaths);
}
}

}
Loading

0 comments on commit 71973a9

Please sign in to comment.