forked from apache/jackrabbit-oak
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OAK-10589 - Improve regex path filtering to also handle cases where e…
…xcludedPaths are defined (apache#1258)
- Loading branch information
Showing
4 changed files
with
490 additions
and
109 deletions.
There are no files selected for viewing
149 changes: 149 additions & 0 deletions
149
...jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined; | ||
|
||
import org.apache.jackrabbit.oak.commons.PathUtils; | ||
import org.apache.jackrabbit.oak.spi.filter.PathFilter; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Objects; | ||
import java.util.TreeSet; | ||
|
||
public class MongoRegexPathFilterFactory { | ||
|
||
public static class MongoFilterPaths { | ||
// Special value that means "download everything". This is used when regex path filtering is disabled or when | ||
// the path filters would require downloading everything. When this is returned, it is preferable to do not | ||
// use any filter in the Mongo query, even though using the values in this object as filters would also result | ||
// in downloading everything | ||
public static final MongoFilterPaths DOWNLOAD_ALL = new MongoFilterPaths(List.of("/"), List.of()); | ||
|
||
final List<String> included; | ||
final List<String> excluded; | ||
|
||
public MongoFilterPaths(List<String> included, List<String> excluded) { | ||
this.included = included; | ||
this.excluded = excluded; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "MongoFilterPaths{" + | ||
"included=" + included + | ||
", excluded=" + excluded + | ||
'}'; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
MongoFilterPaths that = (MongoFilterPaths) o; | ||
return Objects.equals(included, that.included) && Objects.equals(excluded, that.excluded); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(included, excluded); | ||
} | ||
} | ||
|
||
private final static Logger LOG = LoggerFactory.getLogger(MongoRegexPathFilterFactory.class); | ||
|
||
private final int maxNumberOfPaths; | ||
|
||
public MongoRegexPathFilterFactory(int maxNumberOfPaths) { | ||
this.maxNumberOfPaths = maxNumberOfPaths; | ||
} | ||
|
||
public MongoFilterPaths buildMongoFilter(List<PathFilter> pathFilters) { | ||
if (pathFilters == null || pathFilters.isEmpty()) { | ||
return MongoFilterPaths.DOWNLOAD_ALL; | ||
} | ||
// Merge and deduplicate the included and excluded paths of all filters. | ||
// The included paths will also be further de-deuplicated, by removing paths that are children of other path in | ||
// the list, that is, by keeping only the top level paths. To make it easy to de-duplicate, we sort the paths | ||
// alphabetically. | ||
// The list of excluded paths does not need to be sorted, but we do it anyway for consistency when logging the | ||
// list of paths. | ||
TreeSet<String> sortedCandidateIncludedPaths = new TreeSet<>(); | ||
TreeSet<String> candidateExcludedPaths = new TreeSet<>(); | ||
for (PathFilter pathFilter : pathFilters) { | ||
sortedCandidateIncludedPaths.addAll(pathFilter.getIncludedPaths()); | ||
candidateExcludedPaths.addAll(pathFilter.getExcludedPaths()); | ||
} | ||
// Keep only unique included paths. That is, if paths "/a/b" and "/a/b/c" are both in the list, keep only "/a/b" | ||
List<String> finalIncludedPathsRoots = new ArrayList<>(); | ||
LOG.debug("Candidate included paths: {}, candidate excluded paths: {}", sortedCandidateIncludedPaths, candidateExcludedPaths); | ||
// The quadratic algorithm below could easily be replaced by linear time algorithm, but it's not worth the added | ||
// complexity because the list of paths should never be more than a few tens | ||
// Idea for linear time algorithm: after adding a path to the includedPathsRoots, advance in the list of candidates | ||
// skipping all paths that are children of the path just added. Since the list is sorted alphabetically, the children | ||
// of a path will be next to it in the list | ||
for (String candidateIncludedPath : sortedCandidateIncludedPaths) { | ||
if (finalIncludedPathsRoots.stream().noneMatch(ancestor -> PathUtils.isAncestor(ancestor, candidateIncludedPath))) { | ||
finalIncludedPathsRoots.add(candidateIncludedPath); | ||
} | ||
} | ||
|
||
List<String> finalExcludedPaths = new ArrayList<>(); | ||
// Keep an excluded path only if it is not needed by any filter. | ||
for (String candidateExcludedPath : candidateExcludedPaths) { | ||
boolean notNeeded = pathFilters.stream().allMatch(pathFilter -> pathFilter.filter(candidateExcludedPath) == PathFilter.Result.EXCLUDE); | ||
if (notNeeded) { | ||
finalExcludedPaths.add(candidateExcludedPath); | ||
} | ||
} | ||
|
||
// Do not filter if there are too many includedPaths. The official Mongo documentation recommends | ||
// not using more than tens of values in the $in filter to avoid performance degradation. | ||
// https://www.mongodb.com/docs/manual/reference/operator/query/in/ | ||
if (finalIncludedPathsRoots.size() > maxNumberOfPaths) { | ||
LOG.info("Number of includedPaths ({}) exceed the maximum allowed ({}), downloading everything", | ||
finalIncludedPathsRoots.size(), maxNumberOfPaths); | ||
return MongoFilterPaths.DOWNLOAD_ALL; | ||
} | ||
|
||
// Also check the number of excluded paths and disable filtering by them if there are too many. | ||
// We do not need to revert to download the full repository in the case where there is a reasonable number of | ||
// included paths but too many excluded paths. The excluded paths are always children of the included paths, so | ||
// if we do not filter on excludedPaths, we simply download everything inside the included paths. This might be | ||
// much less than the full repository. | ||
if (finalExcludedPaths.size() > maxNumberOfPaths) { | ||
LOG.info("Number of excludedPaths ({}) exceed the maximum allowed ({}), disabling filtering of excludedPaths", | ||
finalExcludedPaths.size(), maxNumberOfPaths); | ||
finalExcludedPaths = List.of(); | ||
} | ||
|
||
if (finalExcludedPaths.isEmpty()) { | ||
// Only included paths | ||
if (finalIncludedPathsRoots.contains("/")) { | ||
return MongoFilterPaths.DOWNLOAD_ALL; | ||
} else { | ||
return new MongoFilterPaths(finalIncludedPathsRoots, List.of()); | ||
} | ||
} else { | ||
return new MongoFilterPaths(finalIncludedPathsRoots, finalExcludedPaths); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.