You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2018/05/11 10:35:08 UTC
[1/2] lucene-solr:branch_7x: LUCENE-8273: Add ConditionalTokenFilter
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x bd2f4efb1 -> 7f13e5642
refs/heads/master 8a697ee09 -> 1ce3ebadb
LUCENE-8273: Add ConditionalTokenFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7f13e564
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7f13e564
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7f13e564
Branch: refs/heads/branch_7x
Commit: 7f13e5642924aa7f97e21b059553cfd6b5b56c21
Parents: bd2f4ef
Author: Alan Woodward <ro...@apache.org>
Authored: Fri May 11 11:11:21 2018 +0100
Committer: Alan Woodward <ro...@apache.org>
Committed: Fri May 11 11:34:37 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 7 +
.../lucene/analysis/custom/CustomAnalyzer.java | 202 +++++++++++++++-
.../miscellaneous/ConditionalTokenFilter.java | 149 ++++++++++++
.../ConditionalTokenFilterFactory.java | 85 +++++++
.../miscellaneous/TermExclusionFilter.java | 52 +++++
.../TermExclusionFilterFactory.java | 58 +++++
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../lucene/analysis/core/TestRandomChains.java | 53 ++++-
.../analysis/custom/TestCustomAnalyzer.java | 44 ++++
.../TestConditionalTokenFilter.java | 233 +++++++++++++++++++
.../miscellaneous/TestTermExclusionFilter.java | 48 ++++
11 files changed, 920 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 8dc3076..e4da240 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -77,6 +77,13 @@ New Features
provided document. This allows to undelete a soft-deleted document unless it's been claimed
by a merge. (Simon Willnauer)
+* LUCENE-8273: ConditionalTokenFilter allows analysis chains to skip particular token
+ filters based on the attributes of the current token. This generalises the keyword
+ token logic currently used for stemmers and WDF. It is integrated into
+ CustomAnalyzer by using the `when` and `whenTerm` builder methods, and a new
+ TermExclusionConditionalFilter is added as an example. (Alan Woodward,
+ Robert Muir, David Smiley, Steve Rowe, Mike Sokolov)
+
Bug Fixes
* LUCENE-8266: Detect bogus tiles when creating a standard polygon
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index a697cce..72614ca 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -17,8 +17,6 @@
package org.apache.lucene.analysis.custom;
-import static org.apache.lucene.analysis.util.AnalysisSPILoader.newFactoryClassInstance;
-
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Path;
@@ -29,10 +27,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
+import java.util.function.Function;
+import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilterFactory;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
@@ -45,6 +48,8 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.Version;
+import static org.apache.lucene.analysis.util.AnalysisSPILoader.newFactoryClassInstance;
+
/**
* A general-purpose Analyzer that can be created with a builder-style API.
* Under the hood it uses the factory classes {@link TokenizerFactory},
@@ -73,6 +78,17 @@ import org.apache.lucene.util.Version;
* <p>The list of names to be used for components can be looked up through:
* {@link TokenizerFactory#availableTokenizers()}, {@link TokenFilterFactory#availableTokenFilters()},
* and {@link CharFilterFactory#availableCharFilters()}.
+ * <p>You can create conditional branches in the analyzer by using {@link Builder#when(String, String...)} and
+ * {@link Builder#whenTerm(Predicate)}:
+ * <pre class="prettyprint">
+ * Analyzer ana = CustomAnalyzer.builder()
+ * .withTokenizer("standard")
+ * .addTokenFilter("lowercase")
+ * .whenTerm(t -> t.length() > 10)
+ * .addTokenFilter("reversestring")
+ * .endwhen()
+ * .build()
+ * </pre>
*/
public final class CustomAnalyzer extends Analyzer {
@@ -335,6 +351,13 @@ public final class CustomAnalyzer extends Analyzer {
componentsAdded = true;
return this;
}
+
+ private Builder addTokenFilter(TokenFilterFactory factory) {
+ Objects.requireNonNull(factory, "TokenFilterFactory may not be null");
+ tokenFilters.add(factory);
+ componentsAdded = true;
+ return this;
+ }
/** Adds the given char filter.
* @param factory class that is used to create the char filter.
@@ -377,6 +400,117 @@ public final class CustomAnalyzer extends Analyzer {
componentsAdded = true;
return this;
}
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}
+ * @param params the parameters to be passed to the factory
+ */
+ public ConditionBuilder when(String name, String... params) throws IOException {
+ return when(name, paramsToMap(params));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}
+ * @param params the parameters to be passed to the factory. The map must be modifiable
+ */
+ @SuppressWarnings("unchecked")
+ public ConditionBuilder when(String name, Map<String, String> params) throws IOException {
+ Class<? extends TokenFilterFactory> clazz = TokenFilterFactory.lookupClass(name);
+ if (ConditionalTokenFilterFactory.class.isAssignableFrom(clazz) == false) {
+ throw new IllegalArgumentException("TokenFilterFactory " + name + " is not a ConditionalTokenFilterFactory");
+ }
+ return when((Class<? extends ConditionalTokenFilterFactory>) clazz, params);
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param factory class that is used to create the ConditionalTokenFilter
+ * @param params the parameters to be passed to the factory
+ */
+ public ConditionBuilder when(Class<? extends ConditionalTokenFilterFactory> factory, String... params) throws IOException {
+ return when(factory, paramsToMap(params));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param factory class that is used to create the ConditionalTokenFilter
+ * @param params the parameters to be passed to the factory. The map must be modifiable
+ */
+ public ConditionBuilder when(Class<? extends ConditionalTokenFilterFactory> factory, Map<String, String> params) throws IOException {
+ return when(newFactoryClassInstance(factory, applyDefaultParams(params)));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ */
+ public ConditionBuilder when(ConditionalTokenFilterFactory factory) {
+ return new ConditionBuilder(factory, this);
+ }
+
+ /**
+ * Apply subsequent token filters if the current token's term matches a predicate
+ *
+ * This is the equivalent of:
+ * <pre>
+ * when(new ConditionalTokenFilterFactory(Collections.emptyMap()) {
+ * {@code @}Override
+ * protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ * return new ConditionalTokenFilter(input, inner) {
+ * CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ * {@code @}Override
+ * protected boolean shouldFilter() {
+ * return predicate.test(termAtt);
+ * }
+ * };
+ * }
+ * });
+ * </pre>
+ */
+ public ConditionBuilder whenTerm(Predicate<CharSequence> predicate) {
+ return new ConditionBuilder(new ConditionalTokenFilterFactory(Collections.emptyMap()) {
+ @Override
+ protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ return new ConditionalTokenFilter(input, inner) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() {
+ return predicate.test(termAtt);
+ }
+ };
+ }
+ }, this);
+ }
/** Builds the analyzer. */
public CustomAnalyzer build() {
@@ -412,11 +546,73 @@ public final class CustomAnalyzer extends Analyzer {
return map;
}
- private <T> T applyResourceLoader(T factory) throws IOException {
+ <T> T applyResourceLoader(T factory) throws IOException {
if (factory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) factory).inform(loader);
}
return factory;
}
}
+
+ /**
+ * Factory class for a {@link ConditionalTokenFilter}
+ */
+ public static class ConditionBuilder {
+
+ private final List<TokenFilterFactory> innerFilters = new ArrayList<>();
+ private final ConditionalTokenFilterFactory factory;
+ private final Builder parent;
+
+ private ConditionBuilder(ConditionalTokenFilterFactory factory, Builder parent) {
+ this.factory = factory;
+ this.parent = parent;
+ }
+
+ /** Adds the given token filter.
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
+ * The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(String name, Map<String, String> params) throws IOException {
+ innerFilters.add(TokenFilterFactory.forName(name, parent.applyDefaultParams(params)));
+ return this;
+ }
+
+ /** Adds the given token filter.
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
+ * The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(String name, String... params) throws IOException {
+ return addTokenFilter(name, parent.paramsToMap(params));
+ }
+
+ /** Adds the given token filter.
+ * @param factory class that is used to create the token filter.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(Class<? extends TokenFilterFactory> factory, Map<String, String> params) throws IOException {
+ innerFilters.add(newFactoryClassInstance(factory, parent.applyDefaultParams(params)));
+ return this;
+ }
+
+ /** Adds the given token filter.
+ * @param factory class that is used to create the token filter.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(Class<? extends TokenFilterFactory> factory, String... params) throws IOException {
+ return addTokenFilter(factory, parent.paramsToMap(params));
+ }
+
+ /**
+ * Close the branch and return to the main analysis chain
+ */
+ public Builder endwhen() throws IOException {
+ factory.setInnerFilters(innerFilters);
+ parent.applyResourceLoader(factory);
+ parent.addTokenFilter(factory);
+ return parent;
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
new file mode 100644
index 0000000..f6f2a54
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Allows skipping TokenFilters based on the current set of attributes.
+ *
+ * To use, implement the {@link #shouldFilter()} method. If it returns {@code false},
+ * then calling {@link #incrementToken()} will use the wrapped TokenFilter to
+ * make changes to the tokenstream. If it returns {@code true}, then the wrapped
+ * filter will be skipped
+ */
+public abstract class ConditionalTokenFilter extends TokenFilter {
+
+ private enum TokenState {
+ READING, PREBUFFERING, BUFFERING, DELEGATING
+ }
+
+ private final class OneTimeWrapper extends TokenStream {
+
+ public OneTimeWrapper(AttributeSource attributeSource) {
+ super(attributeSource);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (state == TokenState.PREBUFFERING) {
+ state = TokenState.BUFFERING;
+ return true;
+ }
+ if (state == TokenState.DELEGATING) {
+ return false;
+ }
+ return ConditionalTokenFilter.this.incrementToken();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ // clearing attributes etc is done by the parent stream,
+ // so must be avoided here
+ }
+
+ @Override
+ public void end() throws IOException {
+ // clearing attributes etc is done by the parent stream,
+ // so must be avoided here
+ }
+ }
+
+ private final TokenStream delegate;
+ private TokenState state = TokenState.READING;
+ private boolean lastTokenFiltered;
+
+ /**
+ * Create a new BypassingTokenFilter
+ * @param input the input TokenStream
+ * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
+ */
+ protected ConditionalTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
+ super(input);
+ this.delegate = inputFactory.apply(new OneTimeWrapper(this));
+ }
+
+ /**
+ * Whether or not to execute the wrapped TokenFilter for the current token
+ */
+ protected abstract boolean shouldFilter() throws IOException;
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.delegate.reset();
+ this.state = TokenState.READING;
+ this.lastTokenFiltered = false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ if (lastTokenFiltered) {
+ this.delegate.end();
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ this.delegate.close();
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ while (true) {
+ if (state == TokenState.READING) {
+ if (input.incrementToken() == false) {
+ return false;
+ }
+ if (shouldFilter()) {
+ lastTokenFiltered = true;
+ state = TokenState.PREBUFFERING;
+ // we determine that the delegate has emitted all the tokens it can at the current
+ // position when OneTimeWrapper.incrementToken() is called in DELEGATING state. To
+ // signal this back to the delegate, we return false, so we now need to reset it
+ // to ensure that it can continue to emit more tokens
+ delegate.reset();
+ boolean more = delegate.incrementToken();
+ state = TokenState.DELEGATING;
+ return more;
+ }
+ lastTokenFiltered = false;
+ return true;
+ }
+ if (state == TokenState.BUFFERING) {
+ return input.incrementToken();
+ }
+ if (state == TokenState.DELEGATING) {
+ clearAttributes();
+ if (delegate.incrementToken()) {
+ return true;
+ }
+ // no more cached tokens
+ state = TokenState.READING;
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
new file mode 100644
index 0000000..9fe27ac
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Abstract parent class for analysis factories that create {@link ConditionalTokenFilter} instances
+ */
+public abstract class ConditionalTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+
+ private List<TokenFilterFactory> innerFilters;
+
+ protected ConditionalTokenFilterFactory(Map<String, String> args) {
+ super(args);
+ }
+
+ /**
+ * Set the inner filter factories to produce the {@link TokenFilter}s that will be
+ * wrapped by the {@link ConditionalTokenFilter}
+ */
+ public void setInnerFilters(List<TokenFilterFactory> innerFilters) {
+ this.innerFilters = innerFilters;
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ if (innerFilters == null || innerFilters.size() == 0) {
+ return input;
+ }
+ Function<TokenStream, TokenStream> innerStream = ts -> {
+ for (TokenFilterFactory factory : innerFilters) {
+ ts = factory.create(ts);
+ }
+ return ts;
+ };
+ return create(input, innerStream);
+ }
+
+ @Override
+ public final void inform(ResourceLoader loader) throws IOException {
+ if (innerFilters == null)
+ return;
+ for (TokenFilterFactory factory : innerFilters) {
+ if (factory instanceof ResourceLoaderAware) {
+ ((ResourceLoaderAware)factory).inform(loader);
+ }
+ }
+ doInform(loader);
+ }
+
+ /**
+ * Initialises this component with the corresponding {@link ResourceLoader}
+ */
+ protected void doInform(ResourceLoader loader) throws IOException { }
+
+ /**
+ * Modify the incoming {@link TokenStream} with a {@link ConditionalTokenFilter}
+ */
+ protected abstract ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner);
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
new file mode 100644
index 0000000..95bf55f
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * A ConditionalTokenFilter that only applies its wrapped filters to tokens that
+ * are not contained in an exclusion set.
+ */
+public class TermExclusionFilter extends ConditionalTokenFilter {
+
+ private final CharArraySet excludeTerms;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Creates a new TermExclusionFilter
+ * @param excludeTerms the set of terms to skip the wrapped filters for
+ * @param input the input TokenStream
+ * @param inputFactory a factory function to create the wrapped filters
+ */
+ public TermExclusionFilter(final CharArraySet excludeTerms, TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
+ super(input, inputFactory);
+ this.excludeTerms = excludeTerms;
+ }
+
+ @Override
+ protected boolean shouldFilter() {
+ return excludeTerms.contains(termAtt.buffer(), 0, termAtt.length()) == false;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
new file mode 100644
index 0000000..f860b72
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+
+/**
+ * Factory for a {@link TermExclusionFilter}
+ */
+public class TermExclusionFilterFactory extends ConditionalTokenFilterFactory {
+
+ public static final String EXCLUDED_TOKENS = "protected";
+
+ private final String wordFiles;
+ private final boolean ignoreCase;
+
+ private CharArraySet excludeTerms;
+
+ public TermExclusionFilterFactory(Map<String, String> args) {
+ super(args);
+ wordFiles = get(args, EXCLUDED_TOKENS);
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ return new TermExclusionFilter(excludeTerms, input, inner);
+ }
+
+ @Override
+ public void doInform(ResourceLoader loader) throws IOException {
+ excludeTerms = getWordSet(loader, wordFiles, ignoreCase);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index b46de18..ddd3c91 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -78,6 +78,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
+org.apache.lucene.analysis.miscellaneous.TermExclusionFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 7839203..bef95f8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -71,13 +71,14 @@ import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
@@ -207,6 +208,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
continue;
}
+ // conditional filters are tested elsewhere
+ if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
+ continue;
+ }
if (Tokenizer.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
@@ -696,16 +701,46 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
-
- final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- continue;
- }
- final TokenFilter flt = createComponent(ctor, args, descr);
- if (flt != null) {
- spec.stream = flt;
+ if (random.nextBoolean()) {
+ long seed = random.nextLong();
+ spec.stream = new ConditionalTokenFilter(spec.stream, in -> {
+ final Object args[] = newFilterArgs(random, in, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ return in;
+ }
+ descr.append("ConditionalTokenFilter: ");
+ TokenStream ts = createComponent(ctor, args, descr);
+ if (ts == null) {
+ return in;
+ }
+ return ts;
+ }) {
+ Random random = new Random(seed);
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ random = new Random(seed);
+ }
+
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return random.nextBoolean();
+ }
+ };
break;
}
+ else {
+ final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ continue;
+ }
+ final TokenFilter flt = createComponent(ctor, args, descr);
+ if (flt != null) {
+ spec.stream = flt;
+ break;
+ }
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index d929bfd..b963812 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -37,6 +37,7 @@ import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
+import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@@ -500,4 +501,47 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
+ public void testConditions() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .whenTerm(t -> t.toString().contains("o"))
+ .addTokenFilter("uppercase")
+ .addTokenFilter(ReverseStringFilterFactory.class)
+ .endwhen()
+ .addTokenFilter("asciifolding")
+ .build();
+
+ assertAnalyzesTo(analyzer, "Héllo world whaT's hãppening",
+ new String[]{ "OLLEH", "DLROW", "what's", "happening" });
+ }
+
+ public void testConditionsWithResourceLoader() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .when("termexclusion", "protected", "org/apache/lucene/analysis/custom/teststop.txt")
+ .addTokenFilter("reversestring")
+ .endwhen()
+ .build();
+
+ assertAnalyzesTo(analyzer, "FOO BAR BAZ",
+ new String[]{ "foo", "bar", "zab" });
+ }
+
+ public void testConditionsWithWrappedResourceLoader() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .whenTerm(t -> t.toString().contains("o") == false)
+ .addTokenFilter("stop",
+ "ignoreCase", "true",
+ "words", "org/apache/lucene/analysis/custom/teststop.txt",
+ "format", "wordset")
+ .endwhen()
+ .build();
+
+ assertAnalyzesTo(analyzer, "foo bar baz", new String[]{ "foo", "baz" });
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
new file mode 100644
index 0000000..e804676
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.CharacterUtils;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
+
+ boolean closed = false;
+ boolean ended = false;
+ boolean reset = false;
+
+ private final class AssertingLowerCaseFilter extends TokenFilter {
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public AssertingLowerCaseFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+ return true;
+ } else
+ return false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ ended = true;
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ closed = true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ reset = true;
+ }
+ }
+
+ public void testSimple() throws IOException {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("Alice", 1, 0, 5),
+ new Token("Bob", 1, 6, 9),
+ new Token("Clara", 1, 10, 15),
+ new Token("David", 1, 16, 21)
+ );
+
+ TokenStream t = new ConditionalTokenFilter(cts, AssertingLowerCaseFilter::new) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return termAtt.toString().contains("o") == false;
+ }
+ };
+
+ assertTokenStreamContents(t, new String[]{ "alice", "Bob", "clara", "david" });
+ assertTrue(closed);
+ assertTrue(reset);
+ assertTrue(ended);
+ }
+
+ private final class TokenSplitter extends TokenFilter {
+
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ String half;
+
+ protected TokenSplitter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (half == null) {
+ if (input.incrementToken() == false) {
+ return false;
+ }
+ half = termAtt.toString().substring(4);
+ termAtt.setLength(4);
+ return true;
+ }
+ termAtt.setEmpty().append(half);
+ half = null;
+ return true;
+ }
+ }
+
+ public void testMultitokenWrapping() throws IOException {
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("tokenpos1", 0, 9),
+ new Token("tokenpos2", 10, 19),
+ new Token("tokenpos3", 20, 29),
+ new Token("tokenpos4", 30, 39)
+ );
+
+ TokenStream ts = new ConditionalTokenFilter(cts, TokenSplitter::new) {
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return termAtt.toString().contains("2") == false;
+ }
+ };
+
+ assertTokenStreamContents(ts, new String[]{
+ "toke", "npos1", "tokenpos2", "toke", "npos3", "toke", "npos4"
+ });
+ }
+
+ private final class EndTrimmingFilter extends FilteringTokenFilter {
+
+ final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ public EndTrimmingFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ protected boolean accept() throws IOException {
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ offsetAtt.setOffset(0, offsetAtt.endOffset() - 2);
+ }
+ }
+
+ public void testEndPropagation() throws IOException {
+ CannedTokenStream cts1 = new CannedTokenStream(0, 20,
+ new Token("alice", 0, 5), new Token("bob", 6, 8)
+ );
+ TokenStream ts1 = new ConditionalTokenFilter(cts1, EndTrimmingFilter::new) {
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return false;
+ }
+ };
+ assertTokenStreamContents(ts1, new String[]{ "alice", "bob" },
+ null, null, null, null, null, 20);
+
+ CannedTokenStream cts2 = new CannedTokenStream(0, 20,
+ new Token("alice", 0, 5), new Token("bob", 6, 8)
+ );
+ TokenStream ts2 = new ConditionalTokenFilter(cts2, EndTrimmingFilter::new) {
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return true;
+ }
+ };
+ assertTokenStreamContents(ts2, new String[]{ "alice", "bob" },
+ null, null, null, null, null, 18);
+ }
+
+ public void testWrapGraphs() throws Exception {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("a", 0, 1),
+ new Token("b", 2, 3),
+ new Token("c", 4, 5),
+ new Token("d", 6, 7),
+ new Token("e", 8, 9)
+ );
+
+ SynonymMap sm;
+ try (Analyzer analyzer = new MockAnalyzer(random())) {
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+ parser.parse(new StringReader("a b, f\nc d, g"));
+ sm = parser.build();
+ }
+
+ TokenStream ts = new ConditionalTokenFilter(cts, in -> new SynonymGraphFilter(in, sm, true)) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return "c".equals(termAtt.toString()) == false;
+ }
+ };
+
+ assertTokenStreamContents(ts, new String[]{
+ "f", "a", "b", "c", "d", "e"
+ },
+ null, null, null,
+ new int[]{
+ 1, 0, 1, 1, 1, 1
+ },
+ new int[]{
+ 2, 1, 1, 1, 1, 1
+ });
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7f13e564/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
new file mode 100644
index 0000000..2e6aecf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+
+public class TestTermExclusionFilter extends BaseTokenStreamTestCase {
+
+ public void testExcludeTerms() throws IOException {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("Alice", 1, 0, 5),
+ new Token("Bob", 1, 6, 9),
+ new Token("Clara", 1, 10, 15),
+ new Token("David", 1, 16, 21)
+ );
+
+ CharArraySet exclusions = new CharArraySet(5, true);
+ exclusions.add("bob");
+
+ TokenStream ts = new TermExclusionFilter(exclusions, cts, LowerCaseFilter::new);
+ assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
+
+ }
+
+}
[2/2] lucene-solr:master: LUCENE-8273: Add ConditionalTokenFilter
Posted by ro...@apache.org.
LUCENE-8273: Add ConditionalTokenFilter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1ce3ebad
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1ce3ebad
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1ce3ebad
Branch: refs/heads/master
Commit: 1ce3ebadbd60d4485145c71b48330d0aabde77ad
Parents: 8a697ee
Author: Alan Woodward <ro...@apache.org>
Authored: Fri May 11 11:11:21 2018 +0100
Committer: Alan Woodward <ro...@apache.org>
Committed: Fri May 11 11:34:59 2018 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 7 +
.../lucene/analysis/custom/CustomAnalyzer.java | 202 +++++++++++++++-
.../miscellaneous/ConditionalTokenFilter.java | 149 ++++++++++++
.../ConditionalTokenFilterFactory.java | 85 +++++++
.../miscellaneous/TermExclusionFilter.java | 52 +++++
.../TermExclusionFilterFactory.java | 58 +++++
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../lucene/analysis/core/TestRandomChains.java | 53 ++++-
.../analysis/custom/TestCustomAnalyzer.java | 44 ++++
.../TestConditionalTokenFilter.java | 233 +++++++++++++++++++
.../miscellaneous/TestTermExclusionFilter.java | 48 ++++
11 files changed, 920 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 50e2845..de790e1 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -175,6 +175,13 @@ New Features
provided document. This allows to undelete a soft-deleted document unless it's been claimed
by a merge. (Simon Willnauer)
+* LUCENE-8273: ConditionalTokenFilter allows analysis chains to skip particular token
+ filters based on the attributes of the current token. This generalises the keyword
+ token logic currently used for stemmers and WDF. It is integrated into
+ CustomAnalyzer by using the `when` and `whenTerm` builder methods, and a new
+ TermExclusionConditionalFilter is added as an example. (Alan Woodward,
+ Robert Muir, David Smiley, Steve Rowe, Mike Sokolov)
+
Bug Fixes
* LUCENE-8266: Detect bogus tiles when creating a standard polygon and
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index a697cce..72614ca 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -17,8 +17,6 @@
package org.apache.lucene.analysis.custom;
-import static org.apache.lucene.analysis.util.AnalysisSPILoader.newFactoryClassInstance;
-
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Path;
@@ -29,10 +27,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
+import java.util.function.Function;
+import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilterFactory;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
@@ -45,6 +48,8 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.SetOnce;
import org.apache.lucene.util.Version;
+import static org.apache.lucene.analysis.util.AnalysisSPILoader.newFactoryClassInstance;
+
/**
* A general-purpose Analyzer that can be created with a builder-style API.
* Under the hood it uses the factory classes {@link TokenizerFactory},
@@ -73,6 +78,17 @@ import org.apache.lucene.util.Version;
* <p>The list of names to be used for components can be looked up through:
* {@link TokenizerFactory#availableTokenizers()}, {@link TokenFilterFactory#availableTokenFilters()},
* and {@link CharFilterFactory#availableCharFilters()}.
+ * <p>You can create conditional branches in the analyzer by using {@link Builder#when(String, String...)} and
+ * {@link Builder#whenTerm(Predicate)}:
+ * <pre class="prettyprint">
+ * Analyzer ana = CustomAnalyzer.builder()
+ * .withTokenizer("standard")
+ * .addTokenFilter("lowercase")
+ * .whenTerm(t -> t.length() > 10)
+ * .addTokenFilter("reversestring")
+ * .endwhen()
+ * .build()
+ * </pre>
*/
public final class CustomAnalyzer extends Analyzer {
@@ -335,6 +351,13 @@ public final class CustomAnalyzer extends Analyzer {
componentsAdded = true;
return this;
}
+
+ private Builder addTokenFilter(TokenFilterFactory factory) {
+ Objects.requireNonNull(factory, "TokenFilterFactory may not be null");
+ tokenFilters.add(factory);
+ componentsAdded = true;
+ return this;
+ }
/** Adds the given char filter.
* @param factory class that is used to create the char filter.
@@ -377,6 +400,117 @@ public final class CustomAnalyzer extends Analyzer {
componentsAdded = true;
return this;
}
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}
+ * @param params the parameters to be passed to the factory
+ */
+ public ConditionBuilder when(String name, String... params) throws IOException {
+ return when(name, paramsToMap(params));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}
+ * @param params the parameters to be passed to the factory. The map must be modifiable
+ */
+ @SuppressWarnings("unchecked")
+ public ConditionBuilder when(String name, Map<String, String> params) throws IOException {
+ Class<? extends TokenFilterFactory> clazz = TokenFilterFactory.lookupClass(name);
+ if (ConditionalTokenFilterFactory.class.isAssignableFrom(clazz) == false) {
+ throw new IllegalArgumentException("TokenFilterFactory " + name + " is not a ConditionalTokenFilterFactory");
+ }
+ return when((Class<? extends ConditionalTokenFilterFactory>) clazz, params);
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param factory class that is used to create the ConditionalTokenFilter
+ * @param params the parameters to be passed to the factory
+ */
+ public ConditionBuilder when(Class<? extends ConditionalTokenFilterFactory> factory, String... params) throws IOException {
+ return when(factory, paramsToMap(params));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ *
+ * @param factory class that is used to create the ConditionalTokenFilter
+ * @param params the parameters to be passed to the factory. The map must be modifiable
+ */
+ public ConditionBuilder when(Class<? extends ConditionalTokenFilterFactory> factory, Map<String, String> params) throws IOException {
+ return when(newFactoryClassInstance(factory, applyDefaultParams(params)));
+ }
+
+ /**
+ * Add a {@link ConditionalTokenFilterFactory} to the analysis chain
+ *
+ * TokenFilters added by subsequent calls to {@link ConditionBuilder#addTokenFilter(String, String...)}
+ * and related functions will only be used if the current token matches the condition. Consumers
+ * must call {@link ConditionBuilder#endwhen()} to return to the normal tokenfilter
+ * chain once conditional filters have been added
+ */
+ public ConditionBuilder when(ConditionalTokenFilterFactory factory) {
+ return new ConditionBuilder(factory, this);
+ }
+
+ /**
+ * Apply subsequent token filters if the current token's term matches a predicate
+ *
+ * This is the equivalent of:
+ * <pre>
+ * when(new ConditionalTokenFilterFactory(Collections.emptyMap()) {
+ * {@code @}Override
+ * protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ * return new ConditionalTokenFilter(input, inner) {
+ * CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ * {@code @}Override
+ * protected boolean shouldFilter() {
+ * return predicate.test(termAtt);
+ * }
+ * };
+ * }
+ * });
+ * </pre>
+ */
+ public ConditionBuilder whenTerm(Predicate<CharSequence> predicate) {
+ return new ConditionBuilder(new ConditionalTokenFilterFactory(Collections.emptyMap()) {
+ @Override
+ protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ return new ConditionalTokenFilter(input, inner) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() {
+ return predicate.test(termAtt);
+ }
+ };
+ }
+ }, this);
+ }
/** Builds the analyzer. */
public CustomAnalyzer build() {
@@ -412,11 +546,73 @@ public final class CustomAnalyzer extends Analyzer {
return map;
}
- private <T> T applyResourceLoader(T factory) throws IOException {
+ <T> T applyResourceLoader(T factory) throws IOException {
if (factory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) factory).inform(loader);
}
return factory;
}
}
+
+ /**
+ * Factory class for a {@link ConditionalTokenFilter}
+ */
+ public static class ConditionBuilder {
+
+ private final List<TokenFilterFactory> innerFilters = new ArrayList<>();
+ private final ConditionalTokenFilterFactory factory;
+ private final Builder parent;
+
+ private ConditionBuilder(ConditionalTokenFilterFactory factory, Builder parent) {
+ this.factory = factory;
+ this.parent = parent;
+ }
+
+ /** Adds the given token filter.
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
+ * The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(String name, Map<String, String> params) throws IOException {
+ innerFilters.add(TokenFilterFactory.forName(name, parent.applyDefaultParams(params)));
+ return this;
+ }
+
+ /** Adds the given token filter.
+ * @param name is used to look up the factory with {@link TokenFilterFactory#forName(String, Map)}.
+ * The list of possible names can be looked up with {@link TokenFilterFactory#availableTokenFilters()}.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(String name, String... params) throws IOException {
+ return addTokenFilter(name, parent.paramsToMap(params));
+ }
+
+ /** Adds the given token filter.
+ * @param factory class that is used to create the token filter.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(Class<? extends TokenFilterFactory> factory, Map<String, String> params) throws IOException {
+ innerFilters.add(newFactoryClassInstance(factory, parent.applyDefaultParams(params)));
+ return this;
+ }
+
+ /** Adds the given token filter.
+ * @param factory class that is used to create the token filter.
+ * @param params the map of parameters to be passed to factory. The map must be modifiable.
+ */
+ public ConditionBuilder addTokenFilter(Class<? extends TokenFilterFactory> factory, String... params) throws IOException {
+ return addTokenFilter(factory, parent.paramsToMap(params));
+ }
+
+ /**
+ * Close the branch and return to the main analysis chain
+ */
+ public Builder endwhen() throws IOException {
+ factory.setInnerFilters(innerFilters);
+ parent.applyResourceLoader(factory);
+ parent.addTokenFilter(factory);
+ return parent;
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
new file mode 100644
index 0000000..f6f2a54
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilter.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Allows skipping TokenFilters based on the current set of attributes.
+ *
+ * To use, implement the {@link #shouldFilter()} method. If it returns {@code false},
+ * then calling {@link #incrementToken()} will use the wrapped TokenFilter to
+ * make changes to the tokenstream. If it returns {@code true}, then the wrapped
+ * filter will be skipped
+ */
+public abstract class ConditionalTokenFilter extends TokenFilter {
+
+ private enum TokenState {
+ READING, PREBUFFERING, BUFFERING, DELEGATING
+ }
+
+ private final class OneTimeWrapper extends TokenStream {
+
+ public OneTimeWrapper(AttributeSource attributeSource) {
+ super(attributeSource);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (state == TokenState.PREBUFFERING) {
+ state = TokenState.BUFFERING;
+ return true;
+ }
+ if (state == TokenState.DELEGATING) {
+ return false;
+ }
+ return ConditionalTokenFilter.this.incrementToken();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ // clearing attributes etc is done by the parent stream,
+ // so must be avoided here
+ }
+
+ @Override
+ public void end() throws IOException {
+ // clearing attributes etc is done by the parent stream,
+ // so must be avoided here
+ }
+ }
+
+ private final TokenStream delegate;
+ private TokenState state = TokenState.READING;
+ private boolean lastTokenFiltered;
+
+ /**
+ * Create a new BypassingTokenFilter
+ * @param input the input TokenStream
+ * @param inputFactory a factory function to create a new instance of the TokenFilter to wrap
+ */
+ protected ConditionalTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
+ super(input);
+ this.delegate = inputFactory.apply(new OneTimeWrapper(this));
+ }
+
+ /**
+ * Whether or not to execute the wrapped TokenFilter for the current token
+ */
+ protected abstract boolean shouldFilter() throws IOException;
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ this.delegate.reset();
+ this.state = TokenState.READING;
+ this.lastTokenFiltered = false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ if (lastTokenFiltered) {
+ this.delegate.end();
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ this.delegate.close();
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ while (true) {
+ if (state == TokenState.READING) {
+ if (input.incrementToken() == false) {
+ return false;
+ }
+ if (shouldFilter()) {
+ lastTokenFiltered = true;
+ state = TokenState.PREBUFFERING;
+ // we determine that the delegate has emitted all the tokens it can at the current
+ // position when OneTimeWrapper.incrementToken() is called in DELEGATING state. To
+ // signal this back to the delegate, we return false, so we now need to reset it
+ // to ensure that it can continue to emit more tokens
+ delegate.reset();
+ boolean more = delegate.incrementToken();
+ state = TokenState.DELEGATING;
+ return more;
+ }
+ lastTokenFiltered = false;
+ return true;
+ }
+ if (state == TokenState.BUFFERING) {
+ return input.incrementToken();
+ }
+ if (state == TokenState.DELEGATING) {
+ clearAttributes();
+ if (delegate.incrementToken()) {
+ return true;
+ }
+ // no more cached tokens
+ state = TokenState.READING;
+ }
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
new file mode 100644
index 0000000..9fe27ac
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ConditionalTokenFilterFactory.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Abstract parent class for analysis factories that create {@link ConditionalTokenFilter} instances
+ */
+public abstract class ConditionalTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+
+ private List<TokenFilterFactory> innerFilters;
+
+ protected ConditionalTokenFilterFactory(Map<String, String> args) {
+ super(args);
+ }
+
+ /**
+ * Set the inner filter factories to produce the {@link TokenFilter}s that will be
+ * wrapped by the {@link ConditionalTokenFilter}
+ */
+ public void setInnerFilters(List<TokenFilterFactory> innerFilters) {
+ this.innerFilters = innerFilters;
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ if (innerFilters == null || innerFilters.size() == 0) {
+ return input;
+ }
+ Function<TokenStream, TokenStream> innerStream = ts -> {
+ for (TokenFilterFactory factory : innerFilters) {
+ ts = factory.create(ts);
+ }
+ return ts;
+ };
+ return create(input, innerStream);
+ }
+
+ @Override
+ public final void inform(ResourceLoader loader) throws IOException {
+ if (innerFilters == null)
+ return;
+ for (TokenFilterFactory factory : innerFilters) {
+ if (factory instanceof ResourceLoaderAware) {
+ ((ResourceLoaderAware)factory).inform(loader);
+ }
+ }
+ doInform(loader);
+ }
+
+ /**
+ * Initialises this component with the corresponding {@link ResourceLoader}
+ */
+ protected void doInform(ResourceLoader loader) throws IOException { }
+
+ /**
+ * Modify the incoming {@link TokenStream} with a {@link ConditionalTokenFilter}
+ */
+ protected abstract ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner);
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
new file mode 100644
index 0000000..95bf55f
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilter.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * A ConditionalTokenFilter that only applies its wrapped filters to tokens that
+ * are not contained in an exclusion set.
+ */
+public class TermExclusionFilter extends ConditionalTokenFilter {
+
+ private final CharArraySet excludeTerms;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ /**
+ * Creates a new TermExclusionFilter
+ * @param excludeTerms the set of terms to skip the wrapped filters for
+ * @param input the input TokenStream
+ * @param inputFactory a factory function to create the wrapped filters
+ */
+ public TermExclusionFilter(final CharArraySet excludeTerms, TokenStream input, Function<TokenStream, TokenStream> inputFactory) {
+ super(input, inputFactory);
+ this.excludeTerms = excludeTerms;
+ }
+
+ @Override
+ protected boolean shouldFilter() {
+ return excludeTerms.contains(termAtt.buffer(), 0, termAtt.length()) == false;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
new file mode 100644
index 0000000..f860b72
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TermExclusionFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+
+/**
+ * Factory for a {@link TermExclusionFilter}
+ */
+public class TermExclusionFilterFactory extends ConditionalTokenFilterFactory {
+
+ public static final String EXCLUDED_TOKENS = "protected";
+
+ private final String wordFiles;
+ private final boolean ignoreCase;
+
+ private CharArraySet excludeTerms;
+
+ public TermExclusionFilterFactory(Map<String, String> args) {
+ super(args);
+ wordFiles = get(args, EXCLUDED_TOKENS);
+ ignoreCase = getBoolean(args, "ignoreCase", false);
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
+ return new TermExclusionFilter(excludeTerms, input, inner);
+ }
+
+ @Override
+ public void doInform(ResourceLoader loader) throws IOException {
+ excludeTerms = getWordSet(loader, wordFiles, ignoreCase);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index b46de18..ddd3c91 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -78,6 +78,7 @@ org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilterFactory
org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilterFactory
org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
+org.apache.lucene.analysis.miscellaneous.TermExclusionFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 7839203..bef95f8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -71,13 +71,14 @@ import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
+import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
-import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
@@ -207,6 +208,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
continue;
}
+ // conditional filters are tested elsewhere
+ if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
+ continue;
+ }
if (Tokenizer.class.isAssignableFrom(c)) {
assertTrue(ctor.toGenericString() + " has unsupported parameter types",
allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
@@ -696,16 +701,46 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
-
- final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
- if (broken(ctor, args)) {
- continue;
- }
- final TokenFilter flt = createComponent(ctor, args, descr);
- if (flt != null) {
- spec.stream = flt;
+ if (random.nextBoolean()) {
+ long seed = random.nextLong();
+ spec.stream = new ConditionalTokenFilter(spec.stream, in -> {
+ final Object args[] = newFilterArgs(random, in, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ return in;
+ }
+ descr.append("ConditionalTokenFilter: ");
+ TokenStream ts = createComponent(ctor, args, descr);
+ if (ts == null) {
+ return in;
+ }
+ return ts;
+ }) {
+ Random random = new Random(seed);
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ random = new Random(seed);
+ }
+
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return random.nextBoolean();
+ }
+ };
break;
}
+ else {
+ final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
+ if (broken(ctor, args)) {
+ continue;
+ }
+ final TokenFilter flt = createComponent(ctor, args, descr);
+ if (flt != null) {
+ spec.stream = flt;
+ break;
+ }
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index d9ea43c..fb86c07 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -37,6 +37,7 @@ import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
+import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
@@ -500,4 +501,47 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
+ public void testConditions() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .whenTerm(t -> t.toString().contains("o"))
+ .addTokenFilter("uppercase")
+ .addTokenFilter(ReverseStringFilterFactory.class)
+ .endwhen()
+ .addTokenFilter("asciifolding")
+ .build();
+
+ assertAnalyzesTo(analyzer, "Héllo world whaT's hãppening",
+ new String[]{ "OLLEH", "DLROW", "what's", "happening" });
+ }
+
+ public void testConditionsWithResourceLoader() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .when("termexclusion", "protected", "org/apache/lucene/analysis/custom/teststop.txt")
+ .addTokenFilter("reversestring")
+ .endwhen()
+ .build();
+
+ assertAnalyzesTo(analyzer, "FOO BAR BAZ",
+ new String[]{ "foo", "bar", "zab" });
+ }
+
+ public void testConditionsWithWrappedResourceLoader() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder()
+ .withTokenizer("whitespace")
+ .addTokenFilter("lowercase")
+ .whenTerm(t -> t.toString().contains("o") == false)
+ .addTokenFilter("stop",
+ "ignoreCase", "true",
+ "words", "org/apache/lucene/analysis/custom/teststop.txt",
+ "format", "wordset")
+ .endwhen()
+ .build();
+
+ assertAnalyzesTo(analyzer, "foo bar baz", new String[]{ "foo", "baz" });
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
new file mode 100644
index 0000000..e804676
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestConditionalTokenFilter.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.CharacterUtils;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+public class TestConditionalTokenFilter extends BaseTokenStreamTestCase {
+
+ boolean closed = false;
+ boolean ended = false;
+ boolean reset = false;
+
+ private final class AssertingLowerCaseFilter extends TokenFilter {
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public AssertingLowerCaseFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+ return true;
+ } else
+ return false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ ended = true;
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ closed = true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ reset = true;
+ }
+ }
+
+ public void testSimple() throws IOException {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("Alice", 1, 0, 5),
+ new Token("Bob", 1, 6, 9),
+ new Token("Clara", 1, 10, 15),
+ new Token("David", 1, 16, 21)
+ );
+
+ TokenStream t = new ConditionalTokenFilter(cts, AssertingLowerCaseFilter::new) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return termAtt.toString().contains("o") == false;
+ }
+ };
+
+ assertTokenStreamContents(t, new String[]{ "alice", "Bob", "clara", "david" });
+ assertTrue(closed);
+ assertTrue(reset);
+ assertTrue(ended);
+ }
+
+ private final class TokenSplitter extends TokenFilter {
+
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ String half;
+
+ protected TokenSplitter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (half == null) {
+ if (input.incrementToken() == false) {
+ return false;
+ }
+ half = termAtt.toString().substring(4);
+ termAtt.setLength(4);
+ return true;
+ }
+ termAtt.setEmpty().append(half);
+ half = null;
+ return true;
+ }
+ }
+
+ public void testMultitokenWrapping() throws IOException {
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("tokenpos1", 0, 9),
+ new Token("tokenpos2", 10, 19),
+ new Token("tokenpos3", 20, 29),
+ new Token("tokenpos4", 30, 39)
+ );
+
+ TokenStream ts = new ConditionalTokenFilter(cts, TokenSplitter::new) {
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return termAtt.toString().contains("2") == false;
+ }
+ };
+
+ assertTokenStreamContents(ts, new String[]{
+ "toke", "npos1", "tokenpos2", "toke", "npos3", "toke", "npos4"
+ });
+ }
+
+ private final class EndTrimmingFilter extends FilteringTokenFilter {
+
+ final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ public EndTrimmingFilter(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ protected boolean accept() throws IOException {
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ offsetAtt.setOffset(0, offsetAtt.endOffset() - 2);
+ }
+ }
+
+ public void testEndPropagation() throws IOException {
+ CannedTokenStream cts1 = new CannedTokenStream(0, 20,
+ new Token("alice", 0, 5), new Token("bob", 6, 8)
+ );
+ TokenStream ts1 = new ConditionalTokenFilter(cts1, EndTrimmingFilter::new) {
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return false;
+ }
+ };
+ assertTokenStreamContents(ts1, new String[]{ "alice", "bob" },
+ null, null, null, null, null, 20);
+
+ CannedTokenStream cts2 = new CannedTokenStream(0, 20,
+ new Token("alice", 0, 5), new Token("bob", 6, 8)
+ );
+ TokenStream ts2 = new ConditionalTokenFilter(cts2, EndTrimmingFilter::new) {
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return true;
+ }
+ };
+ assertTokenStreamContents(ts2, new String[]{ "alice", "bob" },
+ null, null, null, null, null, 18);
+ }
+
+ public void testWrapGraphs() throws Exception {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("a", 0, 1),
+ new Token("b", 2, 3),
+ new Token("c", 4, 5),
+ new Token("d", 6, 7),
+ new Token("e", 8, 9)
+ );
+
+ SynonymMap sm;
+ try (Analyzer analyzer = new MockAnalyzer(random())) {
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+ parser.parse(new StringReader("a b, f\nc d, g"));
+ sm = parser.build();
+ }
+
+ TokenStream ts = new ConditionalTokenFilter(cts, in -> new SynonymGraphFilter(in, sm, true)) {
+ CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ @Override
+ protected boolean shouldFilter() throws IOException {
+ return "c".equals(termAtt.toString()) == false;
+ }
+ };
+
+ assertTokenStreamContents(ts, new String[]{
+ "f", "a", "b", "c", "d", "e"
+ },
+ null, null, null,
+ new int[]{
+ 1, 0, 1, 1, 1, 1
+ },
+ new int[]{
+ 2, 1, 1, 1, 1, 1
+ });
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1ce3ebad/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
new file mode 100644
index 0000000..2e6aecf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTermExclusionFilter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+
+public class TestTermExclusionFilter extends BaseTokenStreamTestCase {
+
+ public void testExcludeTerms() throws IOException {
+
+ CannedTokenStream cts = new CannedTokenStream(
+ new Token("Alice", 1, 0, 5),
+ new Token("Bob", 1, 6, 9),
+ new Token("Clara", 1, 10, 15),
+ new Token("David", 1, 16, 21)
+ );
+
+ CharArraySet exclusions = new CharArraySet(5, true);
+ exclusions.add("bob");
+
+ TokenStream ts = new TermExclusionFilter(exclusions, cts, LowerCaseFilter::new);
+ assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
+
+ }
+
+}