You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/06/28 16:18:14 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9963 Add tests for
alternate path failures in FlattenGraphFilter (#146)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 5669f2b LUCENE-9963 Add tests for alternate path failures in FlattenGraphFilter (#146)
5669f2b is described below
commit 5669f2ba3479fd7dd15105422368f95a45af4ba4
Author: Geoffrey Lawson <ge...@gmail.com>
AuthorDate: Tue Jun 29 01:04:04 2021 +0900
LUCENE-9963 Add tests for alternate path failures in FlattenGraphFilter (#146)
Co-authored-by: Lawson <ge...@amazon.com>
---
.../analysis/core/TestFlattenGraphFilter.java | 233 +++++++++++++++++++++
1 file changed, 233 insertions(+)
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
index c69bcca..96c940c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -17,13 +17,22 @@
package org.apache.lucene.analysis.core;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.CharsRefBuilder;
public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
@@ -280,5 +289,229 @@ public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
}
+  // The end node that the long path is supposed to flatten over does not exist.
+  // assert disabled = pos length of abc = 4
+  // assert enabled = AssertionError: outputEndNode=3 vs inputTo=2
+  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
+  public void testAltPathFirstStepHole() throws Exception {
+    int finalOffset = 3;
+    // Canned input: "abc" followed by "b" and "c", with no "a" token.
+    Token[] inputTokens = {
+      token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)
+    };
+    TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, finalOffset, inputTokens));
+
+    // Expected flattened output, one array entry per emitted token.
+    String[] terms = {"abc", "b", "c"};
+    int[] startOffsets = {0, 1, 2};
+    int[] endOffsets = {3, 2, 3};
+    int[] posIncrements = {1, 1, 1};
+    int[] posLengths = {3, 1, 1};
+
+    assertTokenStreamContents(
+        flattened, terms, startOffsets, endOffsets, posIncrements, posLengths, finalOffset);
+  }
+
+  // The last node in an alt path is what releases the long path, but that node doesn't exist in
+  // this graph.
+  // pos length of abc = 1
+  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
+  public void testAltPathLastStepHole() throws Exception {
+    int finalOffset = 4;
+    // Canned input: "abc" with "a" and "b" on the alt path, then "d" with a position increment
+    // of 2; there is no "c" token.
+    Token[] inputTokens = {
+      token("abc", 1, 3, 0, 3),
+      token("a", 0, 1, 0, 1),
+      token("b", 1, 1, 1, 2),
+      token("d", 2, 1, 3, 4)
+    };
+    TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, finalOffset, inputTokens));
+
+    // Expected flattened output, one array entry per emitted token.
+    String[] terms = {"abc", "a", "b", "d"};
+    int[] startOffsets = {0, 0, 1, 3};
+    int[] endOffsets = {1, 1, 2, 4};
+    int[] posIncrements = {1, 0, 1, 2};
+    int[] posLengths = {3, 1, 1, 1};
+
+    assertTokenStreamContents(
+        flattened, terms, startOffsets, endOffsets, posIncrements, posLengths, finalOffset);
+  }
+
+  // A hole spanning several positions: a position increment > 2 gets squashed to 2, while
+  // offsets are left untouched.
+  public void testLongHole() throws Exception {
+    int finalOffset = 28;
+    // Canned input: "hole" arrives with a position increment of 5.
+    Token[] inputTokens = {
+      token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28)
+    };
+    TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, finalOffset, inputTokens));
+
+    // Expected output: terms and offsets pass through unchanged; only the position increment of
+    // "hole" differs from the input (5 -> 2).
+    String[] terms = {"hello", "hole", "fun"};
+    int[] startOffsets = {0, 20, 25};
+    int[] endOffsets = {5, 24, 28};
+    int[] posIncrements = {1, 2, 1};
+    int[] posLengths = {1, 1, 1};
+
+    assertTokenStreamContents(
+        flattened, terms, startOffsets, endOffsets, posIncrements, posLengths, finalOffset);
+  }
+
+  // Multiple nodes are missing in the alt path. The last edge shows up after both the long edge
+  // and the short edge, which looks good, but the output graph isn't flat.
+  // assert disabled = nothing
+  // assert enabled = AssertionError
+  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
+  public void testAltPathLastStepLongHole() throws Exception {
+    int finalOffset = 4;
+    // Canned input: "abc" and "a", then "d" with a position increment of 3 (no "b" or "c").
+    Token[] inputTokens = {
+      token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)
+    };
+    TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, finalOffset, inputTokens));
+
+    // Expected flattened output, one array entry per emitted token.
+    String[] terms = {"abc", "a", "d"};
+    int[] startOffsets = {0, 0, 3};
+    int[] endOffsets = {1, 1, 4};
+    int[] posIncrements = {1, 0, 1};
+    int[] posLengths = {1, 1, 1};
+
+    assertTokenStreamContents(
+        flattened, terms, startOffsets, endOffsets, posIncrements, posLengths, finalOffset);
+  }
+
+  // LUCENE-8723
+  // The token stream ends without the last node ever showing up.
+  // assert disabled = dropped token
+  // assert enabled = AssertionError: 2
+  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
+  public void testAltPathLastStepHoleWithoutEndToken() throws Exception {
+    int finalOffset = 2;
+    // Canned input: "abc", "a" and "b" only; nothing follows "b".
+    Token[] inputTokens = {
+      token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)
+    };
+    TokenStream flattened = new FlattenGraphFilter(new CannedTokenStream(0, finalOffset, inputTokens));
+
+    // Expected flattened output, one array entry per emitted token.
+    String[] terms = {"abc", "a", "b"};
+    int[] startOffsets = {0, 0, 1};
+    int[] endOffsets = {1, 1, 2};
+    int[] posIncrements = {1, 0, 1};
+    int[] posLengths = {1, 1, 1};
+
+    assertTokenStreamContents(
+        flattened, terms, startOffsets, endOffsets, posIncrements, posLengths, finalOffset);
+  }
+
+  /**
+   * Builds a CharsRef holding 2 or 3 tokens picked at random (with repetition) from a vocabulary.
+   *
+   * @param tokens vocabulary of tokens to pick from
+   * @param charsRefBuilder builder passed to {@code SynonymMap.Builder.join} to hold the result
+   * @param random source of randomness for both the token count and the token selection
+   * @return CharsRef containing 2 or 3 joined tokens
+   */
+  private CharsRef buildMultiTokenCharsRef(
+      String[] tokens, CharsRefBuilder charsRefBuilder, Random random) {
+    // nextInt(2) is 0 or 1, so the result has 2 or 3 tokens.
+    String[] srcTokens = new String[random.nextInt(2) + 2];
+    for (int pos = 0; pos < srcTokens.length; pos++) {
+      srcTokens[pos] = tokens[random.nextInt(tokens.length)];
+    }
+    SynonymMap.Builder.join(srcTokens, charsRefBuilder);
+    return charsRefBuilder.toCharsRef();
+  }
+
+  // Create a random graph then delete some edges (via StopFilter) to see if we can trip up
+  // FlattenGraphFilter. Is there some way we can do this and validate output nodes?
+  @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9963")
+  public void testRandomGraphs() throws Exception {
+    String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"};
+    String[] synTokens = new String[] {"s1", "s2", "s3", "s4"};
+
+    SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
+    CharsRefBuilder charRefBuilder = new CharsRefBuilder();
+    Random random = random();
+
+    // between 10 and 19 synonym entries
+    int synCount = random.nextInt(10) + 10;
+    for (int i = 0; i < synCount; i++) {
+      int type = random.nextInt(4);
+      CharsRef src;
+      CharsRef dest;
+      switch (type) {
+        case 0:
+          // 1:1
+          src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
+          charRefBuilder.clear();
+          dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
+          charRefBuilder.clear();
+          break;
+        case 1:
+          // many:1
+          src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
+          charRefBuilder.clear();
+          dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
+          charRefBuilder.clear();
+          break;
+        case 2:
+          // 1:many
+          src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
+          charRefBuilder.clear();
+          dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
+          charRefBuilder.clear();
+          break;
+        default:
+          // many:many
+          src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
+          charRefBuilder.clear();
+          dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
+          charRefBuilder.clear();
+      }
+      mapBuilder.add(src, dest, true);
+    }
+
+    SynonymMap synMap = mapBuilder.build();
+
+    // between 1 and 4 distinct stop words, drawn from the base and synonym vocabularies combined
+    int stopWordCount = random.nextInt(4) + 1;
+    CharArraySet stopWords = new CharArraySet(stopWordCount, true);
+    while (stopWords.size() < stopWordCount) {
+      int index = random.nextInt(baseTokens.length + synTokens.length);
+      stopWords.add(
+          index < baseTokens.length ? baseTokens[index] : synTokens[index - baseTokens.length]);
+    }
+
+    // SynonymGraphFilter builds the graph, StopFilter removes the stop words, then flatten
+    Analyzer a =
+        new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            Tokenizer in = new WhitespaceTokenizer();
+            TokenStream result = new SynonymGraphFilter(in, synMap, true);
+            result = new StopFilter(result, stopWords);
+            result = new FlattenGraphFilter(result);
+            return new TokenStreamComponents(in, result);
+          }
+        };
+
+    // input text: between 20 and 39 random base tokens, separated by single spaces
+    int tokenCount = random.nextInt(20) + 20;
+    List<String> stringTokens = new ArrayList<>();
+    while (stringTokens.size() < tokenCount) {
+      stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]);
+    }
+
+    String text = String.join(" ", stringTokens);
+    checkAnalysisConsistency(random, a, false, text);
+    a.close();
+  }
+
// NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
}