You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gu...@apache.org on 2021/01/23 23:40:31 UTC
[lucene-solr] branch master updated: LUCENE-9575 Add
PatternTypingFilter to annotate tokens with flags and types (#1995)
This is an automated email from the ASF dual-hosted git repository.
gus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new c087f6f LUCENE-9575 Add PatternTypingFilter to annotate tokens with flags and types (#1995)
c087f6f is described below
commit c087f6f8c0a574c9ad64c6bfeec5d06260433632
Author: Gus Heck <46...@users.noreply.github.com>
AuthorDate: Sat Jan 23 18:40:13 2021 -0500
LUCENE-9575 Add PatternTypingFilter to annotate tokens with flags and types (#1995)
LUCENE-9575 Add PatternTypingFilter
---
.../analysis/pattern/PatternTypingFilter.java | 95 +++++++++++++++++
.../pattern/PatternTypingFilterFactory.java | 118 +++++++++++++++++++++
.../org.apache.lucene.analysis.TokenFilterFactory | 1 +
.../analysis/pattern/TestPatternTypingFilter.java | 77 ++++++++++++++
.../pattern/TestPatternTypingFilterFactory.java | 52 +++++++++
5 files changed, 343 insertions(+)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java
new file mode 100644
index 0000000..fbdbdf9
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilter.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Sets the type attribute to a parameterized value when a token is matched by any one of several regex patterns.
+ * The value set in the type attribute is parameterized with the match groups of the regex used for matching.
+ * In combination with TypeAsSynonymFilter and DropIfFlagged filter this can supply complex synonym patterns
+ * that are protected from subsequent analysis, and optionally drop the original term based on the flag
+ * set in this filter. See {@link PatternTypingFilterFactory} for full documentation.
+ *
+ * @see PatternTypingFilterFactory
+ * @since 8.8.0
+ */
+public class PatternTypingFilter extends TokenFilter {
+
+  private final PatternTypingRule[] replacementAndFlagByPattern;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final FlagsAttribute flagAtt = addAttribute(FlagsAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  public PatternTypingFilter(TokenStream input, PatternTypingRule... replacementAndFlagByPattern) {
+    super(input);
+    this.replacementAndFlagByPattern = replacementAndFlagByPattern;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    for (PatternTypingRule candidate : replacementAndFlagByPattern) {
+      Matcher m = candidate.getPattern().matcher(termAtt);
+      if (m.find()) {
+        // first match wins; replaceFirst() resets and searches again, accepted to avoid excess string creation
+        typeAtt.setType(m.replaceFirst(candidate.getTypeTemplate()));
+        flagAtt.setFlags(candidate.getFlags());
+        break;
+      }
+    }
+    return true;
+  }
+
+  /** Value holding class for pattern typing rules: a regex, the flags to set and a type template. */
+  public static class PatternTypingRule {
+    private final Pattern pattern;
+    // value written to the token's FlagsAttribute when the pattern matches
+    private final int flags;
+    // replacement handed to Matcher.replaceFirst(), so $1, $2 ... refer to the pattern's match groups
+    private final String typeTemplate;
+
+    public PatternTypingRule(Pattern pattern, int flags, String typeTemplate) {
+      this.pattern = pattern;
+      this.flags = flags;
+      this.typeTemplate = typeTemplate;
+    }
+
+    public Pattern getPattern() {
+      return pattern;
+    }
+
+    public int getFlags() {
+      return flags;
+    }
+
+    public String getTypeTemplate() {
+      return typeTemplate;
+    }
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilterFactory.java
new file mode 100644
index 0000000..3eb168b
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTypingFilterFactory.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
+import org.apache.lucene.util.ResourceLoader;
+import org.apache.lucene.util.ResourceLoaderAware;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+
+/**
+ * Provides a filter that will analyze tokens with the analyzer from an arbitrary field type. By itself this
+ * filter is not very useful. Normally it is combined with a filter that reacts to types or flags.
+ *
+ * <pre class="prettyprint" >
+ * <fieldType name="text_taf" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="com.example.PatternTypingFilter" patternFile="patterns.txt"/>
+ * <filter class="solr.TokenAnalyzerFilter" asType="text_en" preserveType="true"/>
+ * <filter class="solr.TypeAsSynonymFilterFactory" prefix="__TAS__"
+ * ignore="word,&lt;ALPHANUM&gt;,&lt;NUM&gt;,&lt;SOUTHEAST_ASIAN&gt;,&lt;IDEOGRAPHIC&gt;,&lt;HIRAGANA&gt;,&lt;KATAKANA&gt;,&lt;HANGUL&gt;,&lt;EMOJI&gt;"/>
+ * </analyzer>
+ * </fieldType></pre>
+ * <p>
+ * Note that a configuration such as above may interfere with multi-word synonyms. The patterns file has the format:
+ * <pre>
+ * (flags) (pattern) ::: (replacement)
+ * </pre>
+ * Therefore to set the first 2 flag bits on the original token matching 401k or 401(k) and adding a type of
+ * 'legal2_401_k' whenever either one is encountered one would use:
+ * <pre>
+ * 3 (\d+)\(?([a-z])\)? ::: legal2_$1_$2
+ * </pre>
+ * Note that the number indicating the flag bits to set must not have leading spaces and be followed by a single
+ * space, and must be 0 if no flags should be set. The flags number should not contain commas or a decimal point.
+ * Lines for which the first character is <code>#</code> will be ignored as comments. Does not support producing
+ * a synonym textually identical to the original term.
+ *
+ * @lucene.spi {@value #NAME}
+ * @since 8.8
+ */
+public class PatternTypingFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+
+ /**
+ * SPI name
+ */
+ public static final String NAME = "patternTyping";
+
+ private final String patternFile;
+ private PatternTypingRule[] rules;
+
+ /**
+ * Creates a new PatternTypingFilterFactory
+ */
+ public PatternTypingFilterFactory(Map<String, String> args) {
+ super(args);
+ patternFile = require(args, "patternFile");
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ /**
+ * Default ctor for compatibility with SPI
+ */
+ public PatternTypingFilterFactory() {
+ throw defaultCtorException();
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) throws IOException {
+ List<PatternTypingRule> ruleList = new ArrayList<>();
+ List<String> lines = getLines(loader, patternFile);
+ // format: # regex ::: typename[_$1[_$2 ...]] (technically _$1 does not need the '_' but it usually makes sense)
+ // eg: 2 (\d+\(?([a-z])\)?\(?(\d+)\)? ::: legal3_$1_$2_3
+ // which yields legal3_501_c_3 for 501(c)(3) or 501c3 and sets the second lowest bit in flags
+ for (String line : lines) {
+ int firstSpace = line.indexOf(" "); // no leading spaces allowed
+ int flagsVal = Integer.parseInt(line.substring(0, firstSpace));
+ line = line.substring(firstSpace + 1);
+ String[] split = line.split(" ::: "); // arbitrary, unlikely to occur in a useful regex easy to read
+ if (split.length != 2) {
+ throw new RuntimeException("The PatternTypingFilter: Always two there are, no more, no less, a pattern and a replacement (separated by ' ::: ' )");
+ }
+ Pattern compiled = Pattern.compile(split[0]);
+ ruleList.add(new PatternTypingRule(compiled, flagsVal, split[1]));
+ }
+ this.rules = ruleList.toArray(new PatternTypingRule[0]);
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new PatternTypingFilter(input, rules);
+ }
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
index be82bf2..ce2fd64 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -97,6 +97,7 @@ org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
+org.apache.lucene.analysis.pattern.PatternTypingFilterFactory
org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilter.java
new file mode 100644
index 0000000..7c206e5
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilter.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.pattern;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pattern.PatternTypingFilter.PatternTypingRule;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * Test that this filter sets a type for tokens matching patterns defined in a patterns.txt file
+ */
+public class TestPatternTypingFilter extends BaseTokenStreamTestCase {
+
+ /**
+ * Test the straight forward cases. When all flags match the token should be dropped
+ */
+ public void testPatterns() throws Exception {
+
+ Token tokenA1 = new Token("One", 0, 2);
+ Token tokenA2 = new Token("401(k)", 4, 9);
+ Token tokenA3 = new Token("two", 11, 13);
+ Token tokenB1 = new Token("three", 15, 19);
+ Token tokenB2 = new Token("401k", 21, 24);
+
+ TokenStream ts = new CannedTokenStream(tokenA1, tokenA2, tokenA3, tokenB1, tokenB2);
+
+ //2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
+ ts = new PatternTypingFilter(ts,
+ new PatternTypingRule(Pattern.compile("^(\\d+)\\(?([a-z])\\)?$"),2,"legal2_$1_$2"));
+
+ assertTokenStreamContents(ts, new String[]{
+ "One", "401(k)", "two", "three", "401k"}, null, null,
+ new String[]{"word", "legal2_401_k", "word", "word", "legal2_401_k"},
+ null, null, null, null, null, false, null,
+ new int[]{0, 2, 0, 0, 2});
+ }
+
+ public void testFirstPatternWins() throws IOException {
+ Token tokenA1 = new Token("One", 0, 2);
+ Token tokenA3 = new Token("forty-two", 11, 13);
+ Token tokenB1 = new Token("4-2", 15, 19);
+
+ TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);
+
+ //2 ^(\d+)\(?([a-z])\)?$ ::: legal2_$1_$2
+ PatternTypingRule p1 = new PatternTypingRule(Pattern.compile("^(\\d+)-(\\d+)$"), 6, "$1_hnum_$2");
+ PatternTypingRule p2 = new PatternTypingRule(Pattern.compile("^(\\w+)-(\\w+)$"), 2, "$1_hword_$2");
+
+ ts = new PatternTypingFilter(ts, p1,p2); // 101
+
+ assertTokenStreamContents(ts, new String[]{
+ "One", "forty-two", "4-2"}, null, null,
+ new String[]{"word", "forty_hword_two", "4_hnum_2"},
+ null, null, null, null, null, false, null,
+ new int[]{0, 2, 6});
+ }
+
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java
new file mode 100644
index 0000000..8d5d115
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTypingFilterFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.StringMockResourceLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * This test just ensures the factory works
+ */
+public class TestPatternTypingFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+ public void testFactory() throws Exception {
+ Token tokenA1 = new Token("One", 0, 2);
+ Token tokenA3 = new Token("forty-two", 11, 13);
+ Token tokenB1 = new Token("4-2", 15, 19);
+
+ TokenStream ts = new CannedTokenStream(tokenA1, tokenA3, tokenB1);
+
+ TokenFilterFactory tokenFilterFactory = tokenFilterFactory("patternTyping", Version.LATEST, new StringMockResourceLoader(
+ "6 \\b(\\d+)-(\\d+) ::: $1_hnum_$2\n" +
+ "2 \\b(\\w+)-(\\w+) ::: $1_hword_$2"
+ ), "patternFile", "patterns.txt");
+
+ ts = tokenFilterFactory.create(ts);
+ assertTokenStreamContents(ts, new String[]{
+ "One", "forty-two", "4-2"}, null, null,
+ new String[]{"word", "forty_hword_two", "4_hnum_2"},
+ null, null, null, null, null, false, null,
+ new int[]{0, 2, 6});
+ }
+}