Commit

Merge pull request elastic#3341 from clintongormley/pattern_capture
Added the "pattern_capture" token filter from Lucene 4.4
clintongormley committed Jul 16, 2013
2 parents 933fd50 + 16e137e commit 1bc8f82
Showing 7 changed files with 412 additions and 6 deletions.
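
A minimal sketch (not part of this commit) of how the new filter might be declared in index analysis settings. The filter and analyzer names (my_url_capture, my_urls) are hypothetical; the keys mirror the "patterns" and "preserve_original" settings read by the PatternCaptureGroupTokenFilterFactory added below, and the numbered patterns.N form follows the settingPrefix.[index] convention documented in Settings.getAsArray.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class PatternCaptureSettingsSketch {
    public static void main(String[] args) {
        // Hypothetical index settings wiring the pattern_capture filter into a custom analyzer.
        Settings indexSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.my_url_capture.type", "pattern_capture")
                .put("index.analysis.filter.my_url_capture.preserve_original", true)
                .put("index.analysis.filter.my_url_capture.patterns.0", "(https?://([a-zA-Z\\-_0-9.]+))")
                .put("index.analysis.analyzer.my_urls.type", "custom")
                .put("index.analysis.analyzer.my_urls.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_urls.filter.0", "my_url_capture")
                .build();
        System.out.println(indexSettings.getAsMap());
    }
}
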
New file: XPatternCaptureGroupTokenFilter.java (package org.apache.lucene.analysis.pattern)
@@ -0,0 +1,200 @@
package org.apache.lucene.analysis.pattern;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;

/**
* CaptureGroup uses Java regexes to emit multiple tokens - one for each capture
* group in one or more patterns.
*
* <p>
* For example, a pattern like:
* </p>
*
* <p>
* <code>"(https?://([a-zA-Z\-_0-9.]+))"</code>
* </p>
*
* <p>
* when matched against the string "http://www.foo.com/index" would return the
* tokens "https://www.foo.com" and "www.foo.com".
* </p>
*
* <p>
* If none of the patterns match, or if preserveOriginal is true, the original
* token will be preserved.
* </p>
* <p>
* Each pattern is matched as often as it can be, so the pattern
* <code> "(...)"</code>, when matched against <code>"abcdefghi"</code> would
* produce <code>["abc","def","ghi"]</code>
* </p>
* <p>
* A camelCaseFilter could be written as:
* </p>
* <p>
* <code>
* "([A-Z]{2,})", <br />
* "(?&lt;![A-Z])([A-Z][a-z]+)", <br />
* "(?:^|\\b|(?&lt;=[0-9_])|(?&lt;=[A-Z]{2}))([a-z]+)", <br />
* "([0-9]+)"
* </code>
* </p>
* <p>
* plus if {@link #preserveOriginal} is true, it would also return
* <code>"camelCaseFilter</code>
* </p>
*/
public final class XPatternCaptureGroupTokenFilter extends TokenFilter {

private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
private State state;
private final Matcher[] matchers;
private final CharsRef spare = new CharsRef();
private final int[] groupCounts;
private final boolean preserveOriginal;
private int[] currentGroup;
private int currentMatcher;

/**
* @param input
* the input {@link TokenStream}
* @param preserveOriginal
* set to true to return the original token even if one of the
* patterns matches
* @param patterns
* an array of {@link Pattern} objects to match against each token
*/

public XPatternCaptureGroupTokenFilter(TokenStream input,
boolean preserveOriginal, Pattern... patterns) {
super(input);
this.preserveOriginal = preserveOriginal;
this.matchers = new Matcher[patterns.length];
this.groupCounts = new int[patterns.length];
this.currentGroup = new int[patterns.length];
for (int i = 0; i < patterns.length; i++) {
this.matchers[i] = patterns[i].matcher("");
this.groupCounts[i] = this.matchers[i].groupCount();
this.currentGroup[i] = -1;
}
}

private boolean nextCapture() {
int min_offset = Integer.MAX_VALUE;
currentMatcher = -1;
Matcher matcher;

for (int i = 0; i < matchers.length; i++) {
matcher = matchers[i];
if (currentGroup[i] == -1) {
currentGroup[i] = matcher.find() ? 1 : 0;
}
if (currentGroup[i] != 0) {
while (currentGroup[i] < groupCounts[i] + 1) {
final int start = matcher.start(currentGroup[i]);
final int end = matcher.end(currentGroup[i]);
if (start == end || preserveOriginal && start == 0
&& spare.length == end) {
currentGroup[i]++;
continue;
}
if (start < min_offset) {
min_offset = start;
currentMatcher = i;
}
break;
}
if (currentGroup[i] == groupCounts[i] + 1) {
currentGroup[i] = -1;
i--;
}
}
}
return currentMatcher != -1;
}

@Override
public boolean incrementToken() throws IOException {

if (currentMatcher != -1 && nextCapture()) {
assert state != null;
clearAttributes();
restoreState(state);
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);

posAttr.setPositionIncrement(0);
charTermAttr.copyBuffer(spare.chars, start, end - start);
currentGroup[currentMatcher]++;
return true;
}

if (!input.incrementToken()) {
return false;
}

char[] buffer = charTermAttr.buffer();
int length = charTermAttr.length();
spare.copyChars(buffer, 0, length);
state = captureState();

for (int i = 0; i < matchers.length; i++) {
matchers[i].reset(spare);
currentGroup[i] = -1;
}

if (preserveOriginal) {
currentMatcher = 0;
} else if (nextCapture()) {
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);

// if we start at 0 we can simply set the length and save the copy
if (start == 0) {
charTermAttr.setLength(end);
} else {
charTermAttr.copyBuffer(spare.chars, start, end - start);
}
currentGroup[currentMatcher]++;
}
return true;

}

@Override
public void reset() throws IOException {
super.reset();
state = null;
currentMatcher = -1;
}

}
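
A minimal usage sketch (not part of this commit) that drives the filter above directly, reproducing the URL example from the class javadoc; the demo class name is hypothetical.

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class XPatternCaptureGroupDemo {
    public static void main(String[] args) throws Exception {
        // The URL pattern from the class javadoc above.
        Pattern url = Pattern.compile("(https?://([a-zA-Z\\-_0-9.]+))");
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_43,
                new StringReader("http://www.foo.com/index"));
        ts = new XPatternCaptureGroupTokenFilter(ts, true, url);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // With preserveOriginal=true this prints the original token followed by
            // both capture groups, all at the same position:
            //   http://www.foo.com/index
            //   http://www.foo.com
            //   www.foo.com
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}
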
Expand Up @@ -352,19 +352,28 @@ private <T> Class<? extends T> loadClass(String prefixValue, String sValue, Stri

     @Override
     public String[] getAsArray(String settingPrefix) throws SettingsException {
-        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY);
+        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY, true);
     }
 
     @Override
     public String[] getAsArray(String settingPrefix, String[] defaultArray) throws SettingsException {
+        return getAsArray(settingPrefix, defaultArray, true);
+    }
+
+    @Override
+    public String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException {
         List<String> result = Lists.newArrayList();
 
         if (get(settingPrefix) != null) {
-            String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
-            if (strings.length > 0) {
-                for (String string : strings) {
-                    result.add(string.trim());
+            if (commaDelimited) {
+                String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
+                if (strings.length > 0) {
+                    for (String string : strings) {
+                        result.add(string.trim());
+                    }
                 }
+            } else {
+                result.add(get(settingPrefix).trim());
             }
         }

Expand Down
17 changes: 16 additions & 1 deletion src/main/java/org/elasticsearch/common/settings/Settings.java
Expand Up @@ -234,6 +234,21 @@ public interface Settings {
* the numbered format.
*
* @param settingPrefix The setting prefix to load the array by
* @param defaultArray The default array to use if no value is specified
* @param commaDelimited Whether to try to parse a string as a comma-delimited value
* @return The setting array values
* @throws SettingsException
*/
String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException;

/**
* The values associated with a setting prefix as an array. The settings array is in the format of:
* <tt>settingPrefix.[index]</tt>.
* <p/>
* <p>If commaDelimited is true, it will automatically load a comma separated list under the settingPrefix and merge with
* the numbered format.
*
* @param settingPrefix The setting prefix to load the array by
* @return The setting array values
* @throws SettingsException
*/
Expand All @@ -253,7 +268,7 @@ public interface Settings {
String[] getAsArray(String settingPrefix) throws SettingsException;

/**
-     * Retruns a parsed version.
+     * Returns a parsed version.
*/
Version getAsVersion(String setting, Version defaultVersion) throws SettingsException;

Expand Down
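
A small sketch (not part of this commit) of why the new commaDelimited flag exists: a regex pattern can legitimately contain commas, for example the {2,} quantifier, so splitting such a value on commas would corrupt it. The class name is hypothetical; only the builder and the getAsArray overloads shown in this commit are used.

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class GetAsArrayCommaDelimitedSketch {
    public static void main(String[] args) {
        // A single regex value that happens to contain a comma (the {2,} quantifier).
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("patterns", "([A-Z]{2,})")
                .build();

        // Default behaviour (commaDelimited = true): the value is split on commas,
        // mangling the regex into "([A-Z]{2" and "})".
        String[] split = settings.getAsArray("patterns", Strings.EMPTY_ARRAY, true);

        // With commaDelimited = false the whole value is kept as a single element,
        // which is why PatternCaptureGroupTokenFilterFactory passes false below.
        String[] whole = settings.getAsArray("patterns", Strings.EMPTY_ARRAY, false);

        System.out.println(split.length); // 2
        System.out.println(whole.length); // 1
    }
}
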
Expand Up @@ -485,6 +485,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);

tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("hyphenation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
Expand Down
New file: PatternCaptureGroupTokenFilterFactory.java (package org.elasticsearch.index.analysis)
@@ -0,0 +1,63 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;


import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.regex.Pattern;

@AnalysisSettingsRequired
public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFactory {
private Pattern[] patterns;
private boolean preserveOriginal;

static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}


@Inject
public PatternCaptureGroupTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name,
@Assisted Settings settings) {
super(index, indexSettings, name, settings);
String[] regexes = settings.getAsArray("patterns",Strings.EMPTY_ARRAY,false);
patterns = new Pattern[regexes.length];
for (int i = 0; i < regexes.length; i++) {
patterns[i] = Pattern.compile(regexes[i]);
}

preserveOriginal = settings.getAsBoolean("preserve_original", true);
}

@Override
public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) {
return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
}
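
For reference, a sketch (not part of this commit) of the camelCase-style configuration described in the filter's javadoc, expressed as the numbered patterns.N settings this factory reads; the filter name camel_case is hypothetical, and preserve_original is left at its default of true. Numbered keys sidestep comma splitting entirely, matching the factory's use of getAsArray("patterns", Strings.EMPTY_ARRAY, false).

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class CamelCaseCaptureSettingsSketch {
    public static void main(String[] args) {
        // The four patterns come from the XPatternCaptureGroupTokenFilter javadoc.
        Settings indexSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.camel_case.type", "pattern_capture")
                .put("index.analysis.filter.camel_case.patterns.0", "([A-Z]{2,})")
                .put("index.analysis.filter.camel_case.patterns.1", "(?<![A-Z])([A-Z][a-z]+)")
                .put("index.analysis.filter.camel_case.patterns.2", "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)")
                .put("index.analysis.filter.camel_case.patterns.3", "([0-9]+)")
                .build();
        System.out.println(indexSettings.getAsMap());
    }
}
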
(The remaining changed files in this commit are not shown here.)
