Commit

Merge pull request elastic#3341 from clintongormley/pattern_capture
Added the "pattern_capture" token filter from Lucene 4.4
clintongormley committed Jul 16, 2013
2 parents 933fd50 + 16e137e commit 1bc8f82
Showing 7 changed files with 412 additions and 6 deletions.
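
A minimal sketch (not part of this commit) of how the new filter might be declared in index analysis settings. The filter and analyzer names (my_url_capture, my_urls) are hypothetical; the keys mirror the "patterns" and "preserve_original" settings read by the PatternCaptureGroupTokenFilterFactory added below, and the numbered patterns.N form follows the settingPrefix.[index] convention documented in Settings.getAsArray.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class PatternCaptureSettingsSketch {
    public static void main(String[] args) {
        // Hypothetical index settings wiring the pattern_capture filter into a custom analyzer.
        Settings indexSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.my_url_capture.type", "pattern_capture")
                .put("index.analysis.filter.my_url_capture.preserve_original", true)
                .put("index.analysis.filter.my_url_capture.patterns.0", "(https?://([a-zA-Z\\-_0-9.]+))")
                .put("index.analysis.analyzer.my_urls.type", "custom")
                .put("index.analysis.analyzer.my_urls.tokenizer", "whitespace")
                .put("index.analysis.analyzer.my_urls.filter.0", "my_url_capture")
                .build();
        System.out.println(indexSettings.getAsMap());
    }
}
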
New file: XPatternCaptureGroupTokenFilter.java (package org.apache.lucene.analysis.pattern)
@@ -0,0 +1,200 @@
package org.apache.lucene.analysis.pattern;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;

/**
* CaptureGroup uses Java regexes to emit multiple tokens - one for each capture
* group in one or more patterns.
*
* <p>
* For example, a pattern like:
* </p>
*
* <p>
* <code>"(https?://([a-zA-Z\-_0-9.]+))"</code>
* </p>
*
* <p>
* when matched against the string "http://www.foo.com/index" would return the
* tokens "https://www.foo.com" and "www.foo.com".
* </p>
*
* <p>
* If none of the patterns match, or if preserveOriginal is true, the original
* token will be preserved.
* </p>
* <p>
* Each pattern is matched as often as it can be, so the pattern
* <code> "(...)"</code>, when matched against <code>"abcdefghi"</code> would
* produce <code>["abc","def","ghi"]</code>
* </p>
* <p>
* A camelCaseFilter could be written as:
* </p>
* <p>
* <code>
* "([A-Z]{2,})", <br />
* "(?&lt;![A-Z])([A-Z][a-z]+)", <br />
* "(?:^|\\b|(?&lt;=[0-9_])|(?&lt;=[A-Z]{2}))([a-z]+)", <br />
* "([0-9]+)"
* </code>
* </p>
* <p>
* plus if {@link #preserveOriginal} is true, it would also return
* <code>"camelCaseFilter</code>
* </p>
*/
public final class XPatternCaptureGroupTokenFilter extends TokenFilter {

private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
private State state;
private final Matcher[] matchers;
private final CharsRef spare = new CharsRef();
private final int[] groupCounts;
private final boolean preserveOriginal;
private int[] currentGroup;
private int currentMatcher;

/**
* @param input
* the input {@link TokenStream}
* @param preserveOriginal
* set to true to return the original token even if one of the
* patterns matches
* @param patterns
* an array of {@link Pattern} objects to match against each token
*/

public XPatternCaptureGroupTokenFilter(TokenStream input,
boolean preserveOriginal, Pattern... patterns) {
super(input);
this.preserveOriginal = preserveOriginal;
this.matchers = new Matcher[patterns.length];
this.groupCounts = new int[patterns.length];
this.currentGroup = new int[patterns.length];
for (int i = 0; i < patterns.length; i++) {
this.matchers[i] = patterns[i].matcher("");
this.groupCounts[i] = this.matchers[i].groupCount();
this.currentGroup[i] = -1;
}
}

private boolean nextCapture() {
int min_offset = Integer.MAX_VALUE;
currentMatcher = -1;
Matcher matcher;

for (int i = 0; i < matchers.length; i++) {
matcher = matchers[i];
if (currentGroup[i] == -1) {
currentGroup[i] = matcher.find() ? 1 : 0;
}
if (currentGroup[i] != 0) {
while (currentGroup[i] < groupCounts[i] + 1) {
final int start = matcher.start(currentGroup[i]);
final int end = matcher.end(currentGroup[i]);
if (start == end || preserveOriginal && start == 0
&& spare.length == end) {
currentGroup[i]++;
continue;
}
if (start < min_offset) {
min_offset = start;
currentMatcher = i;
}
break;
}
if (currentGroup[i] == groupCounts[i] + 1) {
currentGroup[i] = -1;
i--;
}
}
}
return currentMatcher != -1;
}

@Override
public boolean incrementToken() throws IOException {

if (currentMatcher != -1 && nextCapture()) {
assert state != null;
clearAttributes();
restoreState(state);
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);

posAttr.setPositionIncrement(0);
charTermAttr.copyBuffer(spare.chars, start, end - start);
currentGroup[currentMatcher]++;
return true;
}

if (!input.incrementToken()) {
return false;
}

char[] buffer = charTermAttr.buffer();
int length = charTermAttr.length();
spare.copyChars(buffer, 0, length);
state = captureState();

for (int i = 0; i < matchers.length; i++) {
matchers[i].reset(spare);
currentGroup[i] = -1;
}

if (preserveOriginal) {
currentMatcher = 0;
} else if (nextCapture()) {
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);

// if we start at 0 we can simply set the length and save the copy
if (start == 0) {
charTermAttr.setLength(end);
} else {
charTermAttr.copyBuffer(spare.chars, start, end - start);
}
currentGroup[currentMatcher]++;
}
return true;

}

@Override
public void reset() throws IOException {
super.reset();
state = null;
currentMatcher = -1;
}

}
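
A minimal usage sketch (not part of this commit) that drives the filter above directly, reproducing the URL example from the class javadoc; the demo class name is hypothetical.

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class XPatternCaptureGroupDemo {
    public static void main(String[] args) throws Exception {
        // The URL pattern from the class javadoc above.
        Pattern url = Pattern.compile("(https?://([a-zA-Z\\-_0-9.]+))");
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_43,
                new StringReader("http://www.foo.com/index"));
        ts = new XPatternCaptureGroupTokenFilter(ts, true, url);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // With preserveOriginal=true this prints the original token followed by
            // both capture groups, all at the same position:
            //   http://www.foo.com/index
            //   http://www.foo.com
            //   www.foo.com
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}
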
Expand Up @@ -352,19 +352,28 @@ private <T> Class<? extends T> loadClass(String prefixValue, String sValue, Stri

     @Override
     public String[] getAsArray(String settingPrefix) throws SettingsException {
-        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY);
+        return getAsArray(settingPrefix, Strings.EMPTY_ARRAY, true);
     }
 
     @Override
     public String[] getAsArray(String settingPrefix, String[] defaultArray) throws SettingsException {
+        return getAsArray(settingPrefix, defaultArray, true);
+    }
+
+    @Override
+    public String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException {
         List<String> result = Lists.newArrayList();
 
         if (get(settingPrefix) != null) {
-            String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
-            if (strings.length > 0) {
-                for (String string : strings) {
-                    result.add(string.trim());
+            if (commaDelimited) {
+                String[] strings = Strings.splitStringByCommaToArray(get(settingPrefix));
+                if (strings.length > 0) {
+                    for (String string : strings) {
+                        result.add(string.trim());
+                    }
                 }
+            } else {
+                result.add(get(settingPrefix).trim());
             }
         }

Expand Down
17 changes: 16 additions & 1 deletion src/main/java/org/elasticsearch/common/settings/Settings.java
Expand Up @@ -234,6 +234,21 @@ public interface Settings {
* the numbered format.
*
* @param settingPrefix The setting prefix to load the array by
* @param defaultArray The default array to use if no value is specified
* @param commaDelimited Whether to try to parse a string as a comma-delimited value
* @return The setting array values
* @throws SettingsException
*/
String[] getAsArray(String settingPrefix, String[] defaultArray, Boolean commaDelimited) throws SettingsException;

/**
* The values associated with a setting prefix as an array. The settings array is in the format of:
* <tt>settingPrefix.[index]</tt>.
* <p/>
* <p>If commaDelimited is true, it will automatically load a comma separated list under the settingPrefix and merge with
* the numbered format.
*
* @param settingPrefix The setting prefix to load the array by
* @return The setting array values
* @throws SettingsException
*/
Expand All @@ -253,7 +268,7 @@ public interface Settings {
String[] getAsArray(String settingPrefix) throws SettingsException;

/**
-     * Retruns a parsed version.
+     * Returns a parsed version.
*/
Version getAsVersion(String setting, Version defaultVersion) throws SettingsException;

Expand Down
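
A small sketch (not part of this commit) of why the new commaDelimited flag exists: a regex pattern can legitimately contain commas, for example the {2,} quantifier, so splitting such a value on commas would corrupt it. The class name is hypothetical; only the builder and the getAsArray overloads shown in this commit are used.

import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class GetAsArrayCommaDelimitedSketch {
    public static void main(String[] args) {
        // A single regex value that happens to contain a comma (the {2,} quantifier).
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("patterns", "([A-Z]{2,})")
                .build();

        // Default behaviour (commaDelimited = true): the value is split on commas,
        // mangling the regex into "([A-Z]{2" and "})".
        String[] split = settings.getAsArray("patterns", Strings.EMPTY_ARRAY, true);

        // With commaDelimited = false the whole value is kept as a single element,
        // which is why PatternCaptureGroupTokenFilterFactory passes false below.
        String[] whole = settings.getAsArray("patterns", Strings.EMPTY_ARRAY, false);

        System.out.println(split.length); // 2
        System.out.println(whole.length); // 1
    }
}
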
Expand Up @@ -485,6 +485,7 @@ public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);

tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("hyphenation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
Expand Down
New file: PatternCaptureGroupTokenFilterFactory.java (package org.elasticsearch.index.analysis)
@@ -0,0 +1,63 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;


import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.XPatternCaptureGroupTokenFilter;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.regex.Pattern;

@AnalysisSettingsRequired
public class PatternCaptureGroupTokenFilterFactory extends AbstractTokenFilterFactory {
private Pattern[] patterns;
private boolean preserveOriginal;

static {
// LUCENE MONITOR: this should be in Lucene 4.4 copied from Revision: 1471347.
assert Lucene.VERSION == Version.LUCENE_43 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this class should be removed";
}


@Inject
public PatternCaptureGroupTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name,
@Assisted Settings settings) {
super(index, indexSettings, name, settings);
String[] regexes = settings.getAsArray("patterns",Strings.EMPTY_ARRAY,false);
patterns = new Pattern[regexes.length];
for (int i = 0; i < regexes.length; i++) {
patterns[i] = Pattern.compile(regexes[i]);
}

preserveOriginal = settings.getAsBoolean("preserve_original", true);
}

@Override
public XPatternCaptureGroupTokenFilter create(TokenStream tokenStream) {
return new XPatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
}
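
For reference, a sketch (not part of this commit) of the camelCase-style configuration described in the filter's javadoc, expressed as the numbered patterns.N settings this factory reads; the filter name camel_case is hypothetical, and preserve_original is left at its default of true. Numbered keys sidestep comma splitting entirely, matching the factory's use of getAsArray("patterns", Strings.EMPTY_ARRAY, false).

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class CamelCaseCaptureSettingsSketch {
    public static void main(String[] args) {
        // The four patterns come from the XPatternCaptureGroupTokenFilter javadoc.
        Settings indexSettings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.filter.camel_case.type", "pattern_capture")
                .put("index.analysis.filter.camel_case.patterns.0", "([A-Z]{2,})")
                .put("index.analysis.filter.camel_case.patterns.1", "(?<![A-Z])([A-Z][a-z]+)")
                .put("index.analysis.filter.camel_case.patterns.2", "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)")
                .put("index.analysis.filter.camel_case.patterns.3", "([0-9]+)")
                .build();
        System.out.println(indexSettings.getAsMap());
    }
}
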
(The remaining changed files in this commit are not shown here.)
