You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by no...@apache.org on 2017/01/24 03:33:08 UTC
[12/50] [abbrv] lucene-solr:apiv2: LUCENE-7619: add
WordDelimiterGraphFilter (replacing WordDelimiterFilter) to produce a correct
token stream graph when splitting words
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
new file mode 100644
index 0000000..c69bcca
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.core;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
+
+ private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+ final Token t = new Token(term, startOffset, endOffset);
+ t.setPositionIncrement(posInc);
+ t.setPositionLength(posLength);
+ return t;
+ }
+
+ public void testSimpleMock() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+ TokenStream ts = new FlattenGraphFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, ts);
+ }
+ };
+
+ assertAnalyzesTo(a, "wtf happened",
+ new String[] {"wtf", "happened"},
+ new int[] { 0, 4},
+ new int[] { 3, 12},
+ null,
+ new int[] { 1, 1},
+ new int[] { 1, 1},
+ true);
+ }
+
+ // Make sure graph is unchanged if it's already flat
+ public void testAlreadyFlatten() throws Exception {
+ TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+ token("wtf", 1, 1, 0, 3),
+ token("what", 0, 1, 0, 3),
+ token("wow", 0, 1, 0, 3),
+ token("the", 1, 1, 0, 3),
+ token("that's", 0, 1, 0, 3),
+ token("fudge", 1, 1, 0, 3),
+ token("funny", 0, 1, 0, 3),
+ token("happened", 1, 1, 4, 12)
+ });
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
+ new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+ new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+ new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+ new int[] {1, 1, 1, 1, 1, 1, 1, 1},
+ 12);
+ }
+
+ public void testWTF1() throws Exception {
+
+ // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
+ TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+ token("wtf", 1, 5, 0, 3),
+ token("what", 0, 1, 0, 3),
+ token("wow", 0, 3, 0, 3),
+ token("the", 1, 1, 0, 3),
+ token("fudge", 1, 3, 0, 3),
+ token("that's", 1, 1, 0, 3),
+ token("funny", 1, 1, 0, 3),
+ token("happened", 1, 1, 4, 12)
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
+ new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+ new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+ new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+ new int[] {3, 1, 1, 1, 1, 1, 1, 1},
+ 12);
+
+ }
+
+ /** Same as testWTF1 except the "wtf" token comes out later */
+ public void testWTF2() throws Exception {
+
+ // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
+ TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+ token("what", 1, 1, 0, 3),
+ token("wow", 0, 3, 0, 3),
+ token("wtf", 0, 5, 0, 3),
+ token("the", 1, 1, 0, 3),
+ token("fudge", 1, 3, 0, 3),
+ token("that's", 1, 1, 0, 3),
+ token("funny", 1, 1, 0, 3),
+ token("happened", 1, 1, 4, 12)
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
+ new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+ new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+ new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+ new int[] {1, 1, 3, 1, 1, 1, 1, 1},
+ 12);
+
+ }
+
+ public void testNonGreedySynonyms() throws Exception {
+ // This is just "hypothetical" for Lucene today, because SynFilter is
+ // greedy: when two syn rules match on overlapping tokens, only one
+ // (greedily) wins. This test pretends all syn matches could match:
+
+ TokenStream in = new CannedTokenStream(0, 20, new Token[] {
+ token("wizard", 1, 1, 0, 6),
+ token("wizard_of_oz", 0, 3, 0, 12),
+ token("of", 1, 1, 7, 9),
+ token("oz", 1, 1, 10, 12),
+ token("oz_screams", 0, 2, 10, 20),
+ token("screams", 1, 1, 13, 20),
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
+ new int[] {0, 0, 7, 10, 10, 13},
+ new int[] {6, 12, 9, 12, 20, 20},
+ new int[] {1, 0, 1, 1, 0, 1},
+ new int[] {1, 3, 1, 1, 2, 1},
+ 20);
+
+ }
+
+ public void testNonGraph() throws Exception {
+ TokenStream in = new CannedTokenStream(0, 22, new Token[] {
+ token("hello", 1, 1, 0, 5),
+ token("pseudo", 1, 1, 6, 12),
+ token("world", 1, 1, 13, 18),
+ token("fun", 1, 1, 19, 22),
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"hello", "pseudo", "world", "fun"},
+ new int[] {0, 6, 13, 19},
+ new int[] {5, 12, 18, 22},
+ new int[] {1, 1, 1, 1},
+ new int[] {1, 1, 1, 1},
+ 22);
+ }
+
+ public void testSimpleHole() throws Exception {
+ TokenStream in = new CannedTokenStream(0, 13, new Token[] {
+ token("hello", 1, 1, 0, 5),
+ token("hole", 2, 1, 6, 10),
+ token("fun", 1, 1, 11, 13),
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+ assertTokenStreamContents(out,
+ new String[] {"hello", "hole", "fun"},
+ new int[] {0, 6, 11},
+ new int[] {5, 10, 13},
+ new int[] {1, 2, 1},
+ new int[] {1, 1, 1},
+ 13);
+ }
+
+ public void testHoleUnderSyn() throws Exception {
+ // Tests a StopFilter after SynFilter where a stopword in a syn is removed
+ //
+ // wizard of oz -> woz syn, but then "of" becomes a hole
+
+ TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+ token("wizard", 1, 1, 0, 6),
+ token("woz", 0, 3, 0, 12),
+ token("oz", 2, 1, 10, 12),
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ assertTokenStreamContents(out,
+ new String[] {"wizard", "woz", "oz"},
+ new int[] {0, 0, 10},
+ new int[] {6, 12, 12},
+ new int[] {1, 0, 2},
+ new int[] {1, 3, 1},
+ 12);
+ }
+
+ public void testStrangelyNumberedNodes() throws Exception {
+
+ // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
+ TokenStream in = new CannedTokenStream(0, 27, new Token[] {
+ token("dog", 1, 3, 0, 5),
+ token("puppy", 0, 3, 0, 5),
+ token("flies", 3, 1, 6, 11),
+ });
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ assertTokenStreamContents(out,
+ new String[] {"dog", "puppy", "flies"},
+ new int[] {0, 0, 6},
+ new int[] {5, 5, 11},
+ new int[] {1, 0, 1},
+ new int[] {1, 1, 1},
+ 27);
+ }
+
+ public void testTwoLongParallelPaths() throws Exception {
+
+ // "a a a a a a" in parallel with "b b b b b b"
+ TokenStream in = new CannedTokenStream(0, 11, new Token[] {
+ token("a", 1, 1, 0, 1),
+ token("b", 0, 2, 0, 1),
+ token("a", 1, 2, 2, 3),
+ token("b", 1, 2, 2, 3),
+ token("a", 1, 2, 4, 5),
+ token("b", 1, 2, 4, 5),
+ token("a", 1, 2, 6, 7),
+ token("b", 1, 2, 6, 7),
+ token("a", 1, 2, 8, 9),
+ token("b", 1, 2, 8, 9),
+ token("a", 1, 2, 10, 11),
+ token("b", 1, 2, 10, 11),
+ });
+
+
+ TokenStream out = new FlattenGraphFilter(in);
+
+ // ... becomes flattened to a single path with overlapping a/b token between each node:
+ assertTokenStreamContents(out,
+ new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
+ new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
+ new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
+ new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+ new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+ 11);
+
+ }
+
+ // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 7f35298..7f0481f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -446,4 +446,73 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
a.close();
}
}
+
+ /*
+ public void testToDot() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;
+ String text = "PowerSystem2000-5-Shot's";
+ WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ //StringWriter sw = new StringWriter();
+ // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
+ PrintWriter pw = new PrintWriter("/x/tmp/before.dot");
+ TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
+ toDot.toDot();
+ pw.close();
+ System.out.println("TEST DONE");
+ //System.out.println("DOT:\n" + sw.toString());
+ }
+ */
+
+ public void testOnlyNumbers() throws Exception {
+ int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "7-586",
+ new String[] {},
+ new int[] {},
+ new int[] {},
+ null,
+ new int[] {},
+ null,
+ false);
+ }
+
+ public void testNumberPunct() throws Exception {
+ int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "6-",
+ new String[] {"6"},
+ new int[] {0},
+ new int[] {1},
+ null,
+ new int[] {1},
+ null,
+ false);
+ }
+
+ private Analyzer getAnalyzer(final int flags) {
+ return new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
+ }
+ };
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
new file mode 100644
index 0000000..2daf886
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -0,0 +1,897 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.*;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.TestUtil;
+
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
+
+/**
+ * New WordDelimiterGraphFilter tests... most of the tests are in ConvertedLegacyTest
+ * TODO: should explicitly test things like protWords and not rely on
+ * the factory tests in Solr.
+ */
+public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
+
+ public void testOffsets() throws IOException {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ // test that subwords and catenated subwords have
+ // the correct offsets.
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] { "foobar", "foo", "bar" },
+ new int[] { 5, 5, 9 },
+ new int[] { 12, 8, 12 });
+
+ // with illegal offsets:
+ wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ assertTokenStreamContents(wdf,
+ new String[] { "foobar", "foo", "bar" },
+ new int[] { 5, 5, 5 },
+ new int[] { 6, 6, 6 });
+ }
+
+ public void testOffsetChange() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 7 },
+ new int[] { 15 });
+ }
+
+ public void testOffsetChange2() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ // illegal offsets:
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 7 },
+ new int[] { 17 });
+ }
+
+ public void testOffsetChange3() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 8 },
+ new int[] { 16 });
+ }
+
+ public void testOffsetChange4() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] { "foobar", "foo", "bar"},
+ new int[] { 8, 8, 12 },
+ new int[] { 15, 11, 15 });
+ }
+
+ public void doSplit(final String input, String... output) throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf, output);
+ }
+
+ public void testSplits() throws Exception {
+ doSplit("basic-split","basic","split");
+ doSplit("camelCase","camel","Case");
+
+ // non-space marking symbol shouldn't cause split
+ // this is an example in Thai
+ doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+ // possessive followed by delimiter
+ doSplit("test's'", "test");
+
+ // some russian upper and lowercase
+ doSplit("\u0420\u043e\u0431\u0435\u0440\u0442", "\u0420\u043e\u0431\u0435\u0440\u0442");
+ // now cause a split (russian camelCase)
+ doSplit("\u0420\u043e\u0431\u0415\u0440\u0442", "\u0420\u043e\u0431", "\u0415\u0440\u0442");
+
+ // a composed titlecase character, don't split
+ doSplit("a\u01c5ungla", "a\u01c5ungla");
+
+ // a modifier letter, don't split
+ doSplit("\u0633\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0644\u0627\u0645", "\u0633\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0644\u0627\u0645");
+
+ // enclosing mark, don't split
+ doSplit("test\u20dd", "test\u20dd");
+
+ // combining spacing mark (the virama), don't split
+ doSplit("\u0939\u093f\u0928\u094d\u0926\u0940", "\u0939\u093f\u0928\u094d\u0926\u0940");
+
+ // don't split non-ascii digits
+ doSplit("\u0661\u0662\u0663\u0664", "\u0661\u0662\u0663\u0664");
+
+ // don't split supplementaries into unpaired surrogates
+ doSplit("\uD840\uDC00\uD840\uDC00", "\uD840\uDC00\uD840\uDC00");
+ }
+
+ public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
+ flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), flags, null);
+
+ assertTokenStreamContents(wdf, output);
+ }
+
+ /*
+ * Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
+ */
+ public void testPossessives() throws Exception {
+ doSplitPossessive(1, "ra's", "ra");
+ doSplitPossessive(0, "ra's", "ra", "s");
+ }
+
+ /*
+ * Set a large position increment gap of 10 if the token is "largegap" or "/"
+ */
+ private final class LargePosIncTokenFilter extends TokenFilter {
+ private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ protected LargePosIncTokenFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (termAtt.toString().equals("largegap") || termAtt.toString().equals("/"))
+ posIncAtt.setPositionIncrement(10);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ public void testPositionIncrements() throws Exception {
+ final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+ final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
+ tokenizer,
+ flags, protWords));
+ }
+ };
+
+ /* in this case, works as expected. */
+ assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+ new int[] { 0, 9 },
+ new int[] { 6, 13 },
+ null,
+ new int[] { 1, 2 },
+ null,
+ false);
+
+ /* only in this case, posInc of 2 ?! */
+ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" },
+ new int[] { 0, 9, 9, 12 },
+ new int[] { 6, 13, 12, 13 },
+ null,
+ new int[] { 1, 2, 0, 1 },
+ null,
+ false);
+
+ assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+ new int[] { 0, 9, 15 },
+ new int[] { 6, 14, 19 },
+ null,
+ new int[] { 1, 2, 1 },
+ null,
+ false);
+
+ /* analyzer that will consume tokens with large position increments */
+ Analyzer a2 = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(
+ new LargePosIncTokenFilter(tokenizer),
+ flags, protWords));
+ }
+ };
+
+ /* increment of "largegap" is preserved */
+ assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
+ new int[] { 0, 7, 16 },
+ new int[] { 6, 15, 20 },
+ null,
+ new int[] { 1, 10, 1 },
+ null,
+ false);
+
+ /* the "/" had a position increment of 10, where did it go?!?!! */
+ assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
+ new int[] { 0, 9 },
+ new int[] { 6, 13 },
+ null,
+ new int[] { 1, 11 },
+ null,
+ false);
+
+ /* in this case, the increment of 10 from the "/" is carried over */
+ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" },
+ new int[] { 0, 9, 9, 12 },
+ new int[] { 6, 13, 12, 13 },
+ null,
+ new int[] { 1, 11, 0, 1 },
+ null,
+ false);
+
+ assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
+ new int[] { 0, 9, 15 },
+ new int[] { 6, 14, 19 },
+ null,
+ new int[] { 1, 11, 1 },
+ null,
+ false);
+
+ Analyzer a3 = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
+ }
+ };
+
+ assertAnalyzesTo(a3, "lucene.solr",
+ new String[] { "lucenesolr", "lucene", "solr" },
+ new int[] { 0, 0, 7 },
+ new int[] { 11, 6, 11 },
+ null,
+ new int[] { 1, 0, 1 },
+ null,
+ false);
+
+ /* the stopword should add a gap here */
+ assertAnalyzesTo(a3, "the lucene.solr",
+ new String[] { "lucenesolr", "lucene", "solr" },
+ new int[] { 4, 4, 11 },
+ new int[] { 15, 10, 15 },
+ null,
+ new int[] { 2, 0, 1 },
+ null,
+ false);
+
+ IOUtils.close(a, a2, a3);
+ }
+
+ /** concat numbers + words + all */
+ public void testLotsOfConcatenating() throws Exception {
+ final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
+ new int[] { 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 15, 7, 3, 7, 15, 11, 15 },
+ null,
+ new int[] { 1, 0, 0, 1, 1, 0, 1 },
+ null,
+ false);
+ a.close();
+ }
+
+ /** concat numbers + words + all + preserve original */
+ public void testLotsOfConcatenating2() throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses whitespace + wdf */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def-123-456",
+ new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
+ new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
+ new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
+ null,
+ new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
+ null,
+ false);
+ a.close();
+ }
+
+ /** blast some random strings through the analyzer */
+ public void testRandomStrings() throws Exception {
+ int numIterations = atLeast(5);
+ for (int i = 0; i < numIterations; i++) {
+ final int flags = random().nextInt(512);
+ final CharArraySet protectedWords;
+ if (random().nextBoolean()) {
+ protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+ }
+ };
+ // TODO: properly support positionLengthAttribute
+ checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20, false, false);
+ a.close();
+ }
+ }
+
+ /** blast some enormous random strings through the analyzer */
+ public void testRandomHugeStrings() throws Exception {
+ int numIterations = atLeast(5);
+ for (int i = 0; i < numIterations; i++) {
+ final int flags = random().nextInt(512);
+ final CharArraySet protectedWords;
+ if (random().nextBoolean()) {
+ protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ TokenStream wdgf = new WordDelimiterGraphFilter(tokenizer, flags, protectedWords);
+ return new TokenStreamComponents(tokenizer, wdgf);
+ }
+ };
+ // TODO: properly support positionLengthAttribute
+ checkRandomData(random(), a, 20*RANDOM_MULTIPLIER, 8192, false, false);
+ a.close();
+ }
+ }
+
+ public void testEmptyTerm() throws IOException {
+ Random random = random();
+ for (int i = 0; i < 512; i++) {
+ final int flags = i;
+ final CharArraySet protectedWords;
+ if (random.nextBoolean()) {
+ protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
+ } else {
+ protectedWords = null;
+ }
+
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+ }
+ };
+ // depending upon options, this thing may or may not preserve the empty term
+ checkAnalysisConsistency(random, a, random.nextBoolean(), "");
+ a.close();
+ }
+ }
+
+ private Analyzer getAnalyzer(int flags) {
+ return getAnalyzer(flags, null);
+ }
+
+ private Analyzer getAnalyzer(int flags, CharArraySet protectedWords) {
+ return new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new KeywordTokenizer();
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
+ }
+ };
+ }
+
+ private static boolean has(int flags, int flag) {
+ return (flags & flag) != 0;
+ }
+
+ private static boolean isEnglishPossessive(String text, int pos) {
+ if (pos > 2) {
+ if ((text.charAt(pos-1) == 's' || text.charAt(pos-1) == 'S') &&
+ (pos == text.length() || text.charAt(pos) != '-')) {
+ text = text.substring(0, text.length()-2);
+ }
+ }
+ return true;
+ }
+
+ private static class WordPart {
+ final String part;
+ final int startOffset;
+ final int endOffset;
+ final int type;
+
+ public WordPart(String text, int startOffset, int endOffset) {
+ this.part = text.substring(startOffset, endOffset);
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ this.type = toType(part.charAt(0));
+ }
+
+ @Override
+ public String toString() {
+ return "WordPart(" + part + " " + startOffset + "-" + endOffset + ")";
+ }
+ }
+
+ private static final int NUMBER = 0;
+ private static final int LETTER = 1;
+ private static final int DELIM = 2;
+
+ private static int toType(char ch) {
+ if (Character.isDigit(ch)) {
+ // numbers
+ return NUMBER;
+ } else if (Character.isLetter(ch)) {
+ // letters
+ return LETTER;
+ } else {
+ // delimiter
+ return DELIM;
+ }
+ }
+
+ /** Does (hopefully) the same thing as WordDelimiterGraphFilter, according to the flags, but more slowly, returning all string paths combinations. */
+ private Set<String> slowWDF(String text, int flags) {
+
+ // first make word parts:
+ List<WordPart> wordParts = new ArrayList<>();
+ int lastCH = -1;
+ int wordPartStart = 0;
+ boolean inToken = false;
+
+ for(int i=0;i<text.length();i++) {
+ char ch = text.charAt(i);
+ if (toType(ch) == DELIM) {
+ // delimiter
+ if (inToken) {
+ // end current token
+ wordParts.add(new WordPart(text, wordPartStart, i));
+ inToken = false;
+ }
+
+ // strip english possessive at the end of this token?:
+ if (has(flags, STEM_ENGLISH_POSSESSIVE) &&
+ ch == '\'' && i > 0 &&
+ i < text.length()-1 &&
+ (text.charAt(i+1) == 's' || text.charAt(i+1) == 'S') &&
+ toType(text.charAt(i-1)) == LETTER &&
+ (i+2 == text.length() || toType(text.charAt(i+2)) == DELIM)) {
+ i += 2;
+ }
+
+ } else if (inToken == false) {
+ // start new token
+ inToken = true;
+ wordPartStart = i;
+ } else {
+ boolean newToken = false;
+ if (Character.isLetter(lastCH)) {
+ if (Character.isLetter(ch)) {
+ if (has(flags, SPLIT_ON_CASE_CHANGE) && Character.isLowerCase(lastCH) && Character.isLowerCase(ch) == false) {
+ // start new token on lower -> UPPER case change (but not vice versa!)
+ newToken = true;
+ }
+ } else if (has(flags, SPLIT_ON_NUMERICS) && Character.isDigit(ch)) {
+ // start new token on letter -> number change
+ newToken = true;
+ }
+ } else {
+ assert Character.isDigit(lastCH);
+ if (Character.isLetter(ch) && has(flags, SPLIT_ON_NUMERICS) ) {
+ // start new token on number -> letter change
+ newToken = true;
+ }
+ }
+ if (newToken) {
+ wordParts.add(new WordPart(text, wordPartStart, i));
+ wordPartStart = i;
+ }
+ }
+ lastCH = ch;
+ }
+
+ if (inToken) {
+ // add last token
+ wordParts.add(new WordPart(text, wordPartStart, text.length()));
+ }
+
+ Set<String> paths = new HashSet<>();
+ if (wordParts.isEmpty() == false) {
+ enumerate(flags, 0, text, wordParts, paths, new StringBuilder());
+ }
+
+ if (has(flags, PRESERVE_ORIGINAL)) {
+ paths.add(text);
+ }
+
+ if (has(flags, CATENATE_ALL) && wordParts.isEmpty() == false) {
+ StringBuilder b = new StringBuilder();
+ for(WordPart wordPart : wordParts) {
+ b.append(wordPart.part);
+ }
+ paths.add(b.toString());
+ }
+
+ return paths;
+ }
+
+ private void add(StringBuilder path, String part) {
+ if (path.length() != 0) {
+ path.append(' ');
+ }
+ path.append(part);
+ }
+
+ private void add(StringBuilder path, List<WordPart> wordParts, int from, int to) {
+ if (path.length() != 0) {
+ path.append(' ');
+ }
+ // no spaces:
+ for(int i=from;i<to;i++) {
+ path.append(wordParts.get(i).part);
+ }
+ }
+
+ private void addWithSpaces(StringBuilder path, List<WordPart> wordParts, int from, int to) {
+ for(int i=from;i<to;i++) {
+ add(path, wordParts.get(i).part);
+ }
+ }
+
+  /** Finds the end (exclusive) of the series of parts with the same type */
+ private int endOfRun(List<WordPart> wordParts, int start) {
+ int upto = start+1;
+ while(upto < wordParts.size() && wordParts.get(upto).type == wordParts.get(start).type) {
+ upto++;
+ }
+ return upto;
+ }
+
+ /** Recursively enumerates all paths through the word parts */
+ private void enumerate(int flags, int upto, String text, List<WordPart> wordParts, Set<String> paths, StringBuilder path) {
+ if (upto == wordParts.size()) {
+ if (path.length() > 0) {
+ paths.add(path.toString());
+ }
+ } else {
+ int savLength = path.length();
+ int end = endOfRun(wordParts, upto);
+
+ if (wordParts.get(upto).type == NUMBER) {
+ // always output single word, optionally surrounded by delims:
+ if (has(flags, GENERATE_NUMBER_PARTS) || wordParts.size() == 1) {
+ addWithSpaces(path, wordParts, upto, end);
+ if (has(flags, CATENATE_NUMBERS)) {
+ // recurse first with the parts
+ enumerate(flags, end, text, wordParts, paths, path);
+ path.setLength(savLength);
+ // .. and second with the concat
+ add(path, wordParts, upto, end);
+ }
+ } else if (has(flags, CATENATE_NUMBERS)) {
+ add(path, wordParts, upto, end);
+ }
+ enumerate(flags, end, text, wordParts, paths, path);
+ path.setLength(savLength);
+ } else {
+ assert wordParts.get(upto).type == LETTER;
+ // always output single word, optionally surrounded by delims:
+ if (has(flags, GENERATE_WORD_PARTS) || wordParts.size() == 1) {
+ addWithSpaces(path, wordParts, upto, end);
+ if (has(flags, CATENATE_WORDS)) {
+ // recurse first with the parts
+ enumerate(flags, end, text, wordParts, paths, path);
+ path.setLength(savLength);
+ // .. and second with the concat
+ add(path, wordParts, upto, end);
+ }
+ } else if (has(flags, CATENATE_WORDS)) {
+ add(path, wordParts, upto, end);
+ }
+ enumerate(flags, end, text, wordParts, paths, path);
+ path.setLength(savLength);
+ }
+ }
+ }
+
+ public void testBasicGraphSplits() throws Exception {
+ assertGraphStrings(getAnalyzer(0),
+ "PowerShotPlus",
+ "PowerShotPlus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS),
+ "PowerShotPlus",
+ "PowerShotPlus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+ "PowerShotPlus",
+ "Power Shot Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL),
+ "PowerShotPlus",
+ "PowerShotPlus",
+ "Power Shot Plus");
+
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS),
+ "Power-Shot-Plus",
+ "Power Shot Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+ "Power-Shot-Plus",
+ "Power Shot Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL),
+ "Power-Shot-Plus",
+ "Power-Shot-Plus",
+ "Power Shot Plus");
+
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+ "PowerShotPlus",
+ "Power Shot Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+ "PowerShot1000Plus",
+ "Power Shot1000Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE),
+ "Power-Shot-Plus",
+ "Power Shot Plus");
+
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS),
+ "PowerShotPlus",
+ "Power Shot Plus",
+ "PowerShotPlus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS),
+ "PowerShot1000Plus",
+ "Power Shot1000Plus",
+ "PowerShot1000Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS | CATENATE_NUMBERS),
+ "Power-Shot-1000-17-Plus",
+ "Power Shot 1000 17 Plus",
+ "Power Shot 100017 Plus",
+ "PowerShot 1000 17 Plus",
+ "PowerShot 100017 Plus");
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | CATENATE_WORDS | CATENATE_NUMBERS | PRESERVE_ORIGINAL),
+ "Power-Shot-1000-17-Plus",
+ "Power-Shot-1000-17-Plus",
+ "Power Shot 1000 17 Plus",
+ "Power Shot 100017 Plus",
+ "PowerShot 1000 17 Plus",
+ "PowerShot 100017 Plus");
+ }
+
+ /*
+ public void testToDot() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;
+ String text = "PowerSystem2000-5-Shot's";
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
+ //StringWriter sw = new StringWriter();
+ // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
+ PrintWriter pw = new PrintWriter("/tmp/foo2.dot");
+ TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
+ toDot.toDot();
+ pw.close();
+ //System.out.println("DOT:\n" + sw.toString());
+ }
+ */
+
+ private String randomWDFText() {
+ StringBuilder b = new StringBuilder();
+ int length = TestUtil.nextInt(random(), 1, 50);
+ for(int i=0;i<length;i++) {
+ int surpriseMe = random().nextInt(37);
+ int lower = -1;
+ int upper = -1;
+ if (surpriseMe < 10) {
+ // lowercase letter
+ lower = 'a';
+ upper = 'z';
+ } else if (surpriseMe < 20) {
+ // uppercase letter
+ lower = 'A';
+ upper = 'Z';
+ } else if (surpriseMe < 30) {
+ // digit
+ lower = '0';
+ upper = '9';
+ } else if (surpriseMe < 35) {
+ // punct
+ lower = '-';
+ upper = '-';
+ } else {
+ b.append("'s");
+ }
+
+ if (lower != -1) {
+ b.append((char) TestUtil.nextInt(random(), lower, upper));
+ }
+ }
+
+ return b.toString();
+ }
+
+ public void testInvalidFlag() throws Exception {
+ expectThrows(IllegalArgumentException.class,
+ () -> {
+ new WordDelimiterGraphFilter(new CannedTokenStream(), 1 << 31, null);
+ });
+ }
+
+ public void testRandomPaths() throws Exception {
+ int iters = atLeast(100);
+ for(int iter=0;iter<iters;iter++) {
+ String text = randomWDFText();
+ if (VERBOSE) {
+ System.out.println("\nTEST: text=" + text + " len=" + text.length());
+ }
+
+ int flags = 0;
+ if (random().nextBoolean()) {
+ flags |= GENERATE_WORD_PARTS;
+ }
+ if (random().nextBoolean()) {
+ flags |= GENERATE_NUMBER_PARTS;
+ }
+ if (random().nextBoolean()) {
+ flags |= CATENATE_WORDS;
+ }
+ if (random().nextBoolean()) {
+ flags |= CATENATE_NUMBERS;
+ }
+ if (random().nextBoolean()) {
+ flags |= CATENATE_ALL;
+ }
+ if (random().nextBoolean()) {
+ flags |= PRESERVE_ORIGINAL;
+ }
+ if (random().nextBoolean()) {
+ flags |= SPLIT_ON_CASE_CHANGE;
+ }
+ if (random().nextBoolean()) {
+ flags |= SPLIT_ON_NUMERICS;
+ }
+ if (random().nextBoolean()) {
+ flags |= STEM_ENGLISH_POSSESSIVE;
+ }
+
+ verify(text, flags);
+ }
+ }
+
+ /** Runs normal and slow WDGF and compares results */
+ private void verify(String text, int flags) throws IOException {
+
+ Set<String> expected = slowWDF(text, flags);
+ if (VERBOSE) {
+ for(String path : expected) {
+ System.out.println(" " + path);
+ }
+ }
+
+ Set<String> actual = getGraphStrings(getAnalyzer(flags), text);
+ if (actual.equals(expected) == false) {
+ StringBuilder b = new StringBuilder();
+ b.append("\n\nFAIL: text=");
+ b.append(text);
+ b.append(" flags=");
+ b.append(WordDelimiterGraphFilter.flagsToString(flags));
+ b.append('\n');
+ b.append(" expected paths:\n");
+ for (String s : expected) {
+ b.append(" ");
+ b.append(s);
+ if (actual.contains(s) == false) {
+ b.append(" [missing!]");
+ }
+ b.append('\n');
+ }
+
+ b.append(" actual paths:\n");
+ for (String s : actual) {
+ b.append(" ");
+ b.append(s);
+ if (expected.contains(s) == false) {
+ b.append(" [unexpected!]");
+ }
+ b.append('\n');
+ }
+
+ fail(b.toString());
+ }
+ }
+
+ public void testOnlyNumbers() throws Exception {
+ // no token should be produced
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), "7-586");
+ }
+
+ public void testNoCatenate() throws Exception {
+    // all word/number parts should be produced, but no catenated tokens
+ assertGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS), "a-b-c-9-d", "a b c 9 d");
+ }
+
+ public void testCuriousCase1() throws Exception {
+ verify("u-0L-4836-ip4Gw--13--q7--L07E1", CATENATE_WORDS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE);
+ }
+
+ public void testCuriousCase2() throws Exception {
+ verify("u-l-p", CATENATE_ALL);
+ }
+
+ public void testOriginalPosLength() throws Exception {
+ verify("Foo-Bar-Baz", CATENATE_WORDS | SPLIT_ON_CASE_CHANGE | PRESERVE_ORIGINAL);
+ }
+
+ public void testCuriousCase3() throws Exception {
+ verify("cQzk4-GL0izl0mKM-J8--4m-'s", GENERATE_NUMBER_PARTS | CATENATE_NUMBERS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS);
+ }
+
+ public void testEmptyString() throws Exception {
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(new Token("", 0, 0)), DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL, null);
+ wdf.reset();
+ assertTrue(wdf.incrementToken());
+ assertFalse(wdf.incrementToken());
+ wdf.end();
+ wdf.close();
+ }
+
+ public void testProtectedWords() throws Exception {
+ TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9),
+ new Token("foo-bar", 0, 7));
+
+ CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
+ assertGraphStrings(wdf,
+ "foo17-bar foo bar",
+ "foo17-bar foo-bar",
+ "foo17-bar foobar");
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
deleted file mode 100644
index d61fa96..0000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.synonym;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CannedTokenStream;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-
-public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
-
- private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
- final Token t = new Token(term, startOffset, endOffset);
- t.setPositionIncrement(posInc);
- t.setPositionLength(posLength);
- return t;
- }
-
- public void testSimpleMock() throws Exception {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
- TokenStream ts = new FlattenGraphFilter(tokenizer);
- return new TokenStreamComponents(tokenizer, ts);
- }
- };
-
- assertAnalyzesTo(a, "wtf happened",
- new String[] {"wtf", "happened"},
- new int[] { 0, 4},
- new int[] { 3, 12},
- null,
- new int[] { 1, 1},
- new int[] { 1, 1},
- true);
- }
-
- // Make sure graph is unchanged if it's already flat
- public void testAlreadyFlatten() throws Exception {
- TokenStream in = new CannedTokenStream(0, 12, new Token[] {
- token("wtf", 1, 1, 0, 3),
- token("what", 0, 1, 0, 3),
- token("wow", 0, 1, 0, 3),
- token("the", 1, 1, 0, 3),
- token("that's", 0, 1, 0, 3),
- token("fudge", 1, 1, 0, 3),
- token("funny", 0, 1, 0, 3),
- token("happened", 1, 1, 4, 12)
- });
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
- new int[] {0, 0, 0, 0, 0, 0, 0, 4},
- new int[] {3, 3, 3, 3, 3, 3, 3, 12},
- new int[] {1, 0, 0, 1, 0, 1, 0, 1},
- new int[] {1, 1, 1, 1, 1, 1, 1, 1},
- 12);
- }
-
- public void testWTF1() throws Exception {
-
- // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
- TokenStream in = new CannedTokenStream(0, 12, new Token[] {
- token("wtf", 1, 5, 0, 3),
- token("what", 0, 1, 0, 3),
- token("wow", 0, 3, 0, 3),
- token("the", 1, 1, 0, 3),
- token("fudge", 1, 3, 0, 3),
- token("that's", 1, 1, 0, 3),
- token("funny", 1, 1, 0, 3),
- token("happened", 1, 1, 4, 12)
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
- new int[] {0, 0, 0, 0, 0, 0, 0, 4},
- new int[] {3, 3, 3, 3, 3, 3, 3, 12},
- new int[] {1, 0, 0, 1, 0, 1, 0, 1},
- new int[] {3, 1, 1, 1, 1, 1, 1, 1},
- 12);
-
- }
-
- /** Same as testWTF1 except the "wtf" token comes out later */
- public void testWTF2() throws Exception {
-
- // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
- TokenStream in = new CannedTokenStream(0, 12, new Token[] {
- token("what", 1, 1, 0, 3),
- token("wow", 0, 3, 0, 3),
- token("wtf", 0, 5, 0, 3),
- token("the", 1, 1, 0, 3),
- token("fudge", 1, 3, 0, 3),
- token("that's", 1, 1, 0, 3),
- token("funny", 1, 1, 0, 3),
- token("happened", 1, 1, 4, 12)
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
- new int[] {0, 0, 0, 0, 0, 0, 0, 4},
- new int[] {3, 3, 3, 3, 3, 3, 3, 12},
- new int[] {1, 0, 0, 1, 0, 1, 0, 1},
- new int[] {1, 1, 3, 1, 1, 1, 1, 1},
- 12);
-
- }
-
- public void testNonGreedySynonyms() throws Exception {
- // This is just "hypothetical" for Lucene today, because SynFilter is
- // greedy: when two syn rules match on overlapping tokens, only one
- // (greedily) wins. This test pretends all syn matches could match:
-
- TokenStream in = new CannedTokenStream(0, 20, new Token[] {
- token("wizard", 1, 1, 0, 6),
- token("wizard_of_oz", 0, 3, 0, 12),
- token("of", 1, 1, 7, 9),
- token("oz", 1, 1, 10, 12),
- token("oz_screams", 0, 2, 10, 20),
- token("screams", 1, 1, 13, 20),
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
- new int[] {0, 0, 7, 10, 10, 13},
- new int[] {6, 12, 9, 12, 20, 20},
- new int[] {1, 0, 1, 1, 0, 1},
- new int[] {1, 3, 1, 1, 2, 1},
- 20);
-
- }
-
- public void testNonGraph() throws Exception {
- TokenStream in = new CannedTokenStream(0, 22, new Token[] {
- token("hello", 1, 1, 0, 5),
- token("pseudo", 1, 1, 6, 12),
- token("world", 1, 1, 13, 18),
- token("fun", 1, 1, 19, 22),
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"hello", "pseudo", "world", "fun"},
- new int[] {0, 6, 13, 19},
- new int[] {5, 12, 18, 22},
- new int[] {1, 1, 1, 1},
- new int[] {1, 1, 1, 1},
- 22);
- }
-
- public void testSimpleHole() throws Exception {
- TokenStream in = new CannedTokenStream(0, 13, new Token[] {
- token("hello", 1, 1, 0, 5),
- token("hole", 2, 1, 6, 10),
- token("fun", 1, 1, 11, 13),
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
- assertTokenStreamContents(out,
- new String[] {"hello", "hole", "fun"},
- new int[] {0, 6, 11},
- new int[] {5, 10, 13},
- new int[] {1, 2, 1},
- new int[] {1, 1, 1},
- 13);
- }
-
- public void testHoleUnderSyn() throws Exception {
- // Tests a StopFilter after SynFilter where a stopword in a syn is removed
- //
- // wizard of oz -> woz syn, but then "of" becomes a hole
-
- TokenStream in = new CannedTokenStream(0, 12, new Token[] {
- token("wizard", 1, 1, 0, 6),
- token("woz", 0, 3, 0, 12),
- token("oz", 2, 1, 10, 12),
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- assertTokenStreamContents(out,
- new String[] {"wizard", "woz", "oz"},
- new int[] {0, 0, 10},
- new int[] {6, 12, 12},
- new int[] {1, 0, 2},
- new int[] {1, 3, 1},
- 12);
- }
-
- public void testStrangelyNumberedNodes() throws Exception {
-
- // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
- TokenStream in = new CannedTokenStream(0, 27, new Token[] {
- token("dog", 1, 3, 0, 5),
- token("puppy", 0, 3, 0, 5),
- token("flies", 3, 1, 6, 11),
- });
-
- TokenStream out = new FlattenGraphFilter(in);
-
- assertTokenStreamContents(out,
- new String[] {"dog", "puppy", "flies"},
- new int[] {0, 0, 6},
- new int[] {5, 5, 11},
- new int[] {1, 0, 1},
- new int[] {1, 1, 1},
- 27);
- }
-
- public void testTwoLongParallelPaths() throws Exception {
-
- // "a a a a a a" in parallel with "b b b b b b"
- TokenStream in = new CannedTokenStream(0, 11, new Token[] {
- token("a", 1, 1, 0, 1),
- token("b", 0, 2, 0, 1),
- token("a", 1, 2, 2, 3),
- token("b", 1, 2, 2, 3),
- token("a", 1, 2, 4, 5),
- token("b", 1, 2, 4, 5),
- token("a", 1, 2, 6, 7),
- token("b", 1, 2, 6, 7),
- token("a", 1, 2, 8, 9),
- token("b", 1, 2, 8, 9),
- token("a", 1, 2, 10, 11),
- token("b", 1, 2, 10, 11),
- });
-
-
- TokenStream out = new FlattenGraphFilter(in);
-
- // ... becomes flattened to a single path with overlapping a/b token between each node:
- assertTokenStreamContents(out,
- new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
- new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
- new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
- new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
- new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
- 11);
-
- }
-
- // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
index edf2d2a..e00a165 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
@@ -17,14 +17,22 @@
package org.apache.lucene.analysis.synonym;
+import java.io.IOException;
+import java.io.StringReader;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -35,7 +43,6 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
@@ -49,15 +56,6 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
-import java.io.IOException;
-import java.io.StringReader;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
/** Set as a side effect by {@link #getAnalyzer} and {@link #getFlattenAnalyzer}. */
@@ -1832,7 +1830,7 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1},
new int[] {1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1});
- assertAllStrings(analyzer, "the usa is wealthy", new String[] {
+ assertGraphStrings(analyzer, "the usa is wealthy", new String[] {
"the usa is wealthy",
"the united states is wealthy",
"the u s a is wealthy",
@@ -1924,33 +1922,4 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
new int[]{1, 1, 0, 1, 1});
a.close();
}
-
- /**
- * Helper method to validate all strings that can be generated from a token stream.
- * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all
- * and only the given valid strings.
- * @param analyzer analyzer containing the SynonymFilter under test.
- * @param text text to be analyzed.
- * @param expectedStrings all expected finite strings.
- */
- public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException {
- TokenStream tokenStream = analyzer.tokenStream("dummy", text);
- try {
- Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
- Set<IntsRef> finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
-
- assertEquals("Invalid resulting strings count. Expected " + expectedStrings.length + " was " + finiteStrings.size(),
- expectedStrings.length, finiteStrings.size());
-
- Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
-
- BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
- for (IntsRef ir: finiteStrings) {
- String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' ');
- assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s));
- }
- } finally {
- tokenStream.close();
- }
- }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
index 64bac66..0675abe 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@@ -39,6 +39,7 @@ import org.apache.lucene.util.automaton.Automaton;
public class TokenStreamToAutomaton {
private boolean preservePositionIncrements;
+ private boolean finalOffsetGapAsHole;
private boolean unicodeArcs;
/** Sole constructor. */
@@ -51,6 +52,11 @@ public class TokenStreamToAutomaton {
this.preservePositionIncrements = enablePositionIncrements;
}
+ /** If true, any final offset gaps will result in adding a position hole. */
+ public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
+ this.finalOffsetGapAsHole = finalOffsetGapAsHole;
+ }
+
/** Whether to make transition labels Unicode code points instead of UTF8 bytes,
* <code>false</code> by default */
public void setUnicodeArcs(boolean unicodeArcs) {
@@ -118,7 +124,7 @@ public class TokenStreamToAutomaton {
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
- if (!preservePositionIncrements && posInc > 1) {
+ if (preservePositionIncrements == false && posInc > 1) {
posInc = 1;
}
assert pos > -1 || posInc > 0;
@@ -201,10 +207,35 @@ public class TokenStreamToAutomaton {
}
in.end();
+
int endState = -1;
- if (offsetAtt.endOffset() > maxOffset) {
+
+ int endPosInc = posIncAtt.getPositionIncrement();
+
+ if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
+ endPosInc = 1;
+ }
+
+ if (endPosInc > 0) {
+ // there were hole(s) after the last token
endState = builder.createState();
- builder.setAccept(endState, true);
+
+ // add trailing holes now:
+ int lastState = endState;
+ while (true) {
+ int state1 = builder.createState();
+ builder.addTransition(lastState, state1, HOLE);
+ endPosInc--;
+ if (endPosInc == 0) {
+ builder.setAccept(state1, true);
+ break;
+ }
+ int state2 = builder.createState();
+ builder.addTransition(state1, state2, POS_SEP);
+ lastState = state2;
+ }
+ } else {
+ endState = -1;
}
pos++;
@@ -219,7 +250,7 @@ public class TokenStreamToAutomaton {
}
pos++;
}
-
+
return builder.finish();
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
index cdc5d42..166d6b2 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
@@ -43,7 +43,7 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
// OffsetAtt
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
index c89a374..ad1e232 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
@@ -107,7 +107,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override
public void setOffset(int startOffset, int endOffset) {
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
this.startOffset = startOffset;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
index 4d63d6f..e89fec1 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
@Override
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0) {
- throw new IllegalArgumentException
- ("Increment must be zero or greater: got " + positionIncrement);
+ throw new IllegalArgumentException("Position increment must be zero or greater; got " + positionIncrement);
}
this.positionIncrement = positionIncrement;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
index 9bfdb49..d019a2b 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
@Override
public void setPositionLength(int positionLength) {
if (positionLength < 1) {
- throw new IllegalArgumentException
- ("Position length must be 1 or greater: got " + positionLength);
+ throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength);
}
this.positionLength = positionLength;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
index 8899dd1..7e98662 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -21,16 +21,22 @@ import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.HashSet;
import java.util.List;
import java.util.Random;
+import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
@@ -565,7 +571,13 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
}
- // TODO: testEndsWithHole... but we need posInc to set in TS.end()
+ public void testEndsWithHole() throws Exception {
+ final TokenStream ts = new CannedTokenStream(1, 0,
+ new Token[] {
+ token("abc", 2, 1),
+ });
+ assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts);
+ }
public void testSynHangingOverEnd() throws Exception {
final TokenStream ts = new CannedTokenStream(
@@ -576,14 +588,47 @@ public class TestGraphTokenizers extends BaseTokenStreamTestCase {
assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
}
+ /** Returns all paths */
+ private Set<String> toPathStrings(Automaton a) {
+ BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+ Set<String> paths = new HashSet<>();
+ for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) {
+ paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+ }
+ return paths;
+ }
+
private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
}
private void assertSameLanguage(Automaton expected, Automaton actual) {
- assertTrue(Operations.sameLanguage(
- Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
- Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
+ Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES);
+ Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES);
+ if (Operations.sameLanguage(expectedDet, actualDet) == false) {
+ Set<String> expectedPaths = toPathStrings(expectedDet);
+ Set<String> actualPaths = toPathStrings(actualDet);
+ StringBuilder b = new StringBuilder();
+ b.append("expected:\n");
+ for(String path : expectedPaths) {
+ b.append(" ");
+ b.append(path);
+ if (actualPaths.contains(path) == false) {
+ b.append(" [missing!]");
+ }
+ b.append('\n');
+ }
+ b.append("actual:\n");
+ for(String path : actualPaths) {
+ b.append(" ");
+ b.append(path);
+ if (expectedPaths.contains(path) == false) {
+ b.append(" [unexpected!]");
+ }
+ b.append('\n');
+ }
+ fail("accepted language is different:\n\n" + b.toString());
+ }
}
public void testTokenStreamGraphWithHoles() throws Exception {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
index 19982a5..9c6a624 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@@ -332,6 +332,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
tsta.setPreservePositionIncrements(preservePositionIncrements);
+ tsta.setFinalOffsetGapAsHole(true);
return tsta;
}
@@ -865,7 +866,7 @@ public class AnalyzingSuggester extends Lookup implements Accountable {
// Turn tokenstream into automaton:
Automaton automaton = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
- automaton = getTokenStreamToAutomaton().toAutomaton(ts);
+ automaton = getTokenStreamToAutomaton().toAutomaton(ts);
}
automaton = replaceSep(automaton);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/637915b8/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index 924756e..070eab2 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -41,11 +41,16 @@ import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
+import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.fst.Util;
/**
* Base class for all Lucene unit tests that use TokenStreams.
@@ -166,6 +171,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
final Map<Integer,Integer> posToStartOffset = new HashMap<>();
final Map<Integer,Integer> posToEndOffset = new HashMap<>();
+ // TODO: would be nice to be able to assert silly duplicated tokens are not created, but a number of cases do this "legitimately": LUCENE-7622
+
ts.reset();
int pos = -1;
int lastStartOffset = 0;
@@ -182,7 +189,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
assertTrue("token "+i+" does not exist", ts.incrementToken());
assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
-
+
assertEquals("term "+i, output[i], termAtt.toString());
if (startOffsets != null) {
assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset());
@@ -261,12 +268,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
if (posLengthAtt != null) {
- assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+ assertTrue("posLength must be >= 1; got: " + posLengthAtt.getPositionLength(), posLengthAtt.getPositionLength() >= 1);
}
}
if (ts.incrementToken()) {
- fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt);
+ fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + ts.getAttribute(CharTermAttribute.class));
}
// repeat our extra safety checks for end()
@@ -977,4 +984,105 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static AttributeFactory newAttributeFactory() {
return newAttributeFactory(random());
}
+
+ private static String toString(Set<String> strings) {
+ List<String> stringsList = new ArrayList<>(strings);
+ Collections.sort(stringsList);
+ StringBuilder b = new StringBuilder();
+ for(String s : stringsList) {
+ b.append(" ");
+ b.append(s);
+ b.append('\n');
+ }
+ return b.toString();
+ }
+
+ /**
+ * Enumerates all accepted strings in the token graph created by the analyzer on the provided text, and then
+ * asserts that it's equal to the expected strings.
+ * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all
+ * and only the given valid strings.
+ * @param analyzer analyzer containing the SynonymFilter under test.
+ * @param text text to be analyzed.
+ * @param expectedStrings all expected finite strings.
+ */
+ public static void assertGraphStrings(Analyzer analyzer, String text, String... expectedStrings) throws IOException {
+ checkAnalysisConsistency(random(), analyzer, true, text, true);
+ try (TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
+ assertGraphStrings(tokenStream, expectedStrings);
+ }
+ }
+
+ /**
+ * Enumerates all accepted strings in the token graph created by the already initialized {@link TokenStream}.
+ */
+ public static void assertGraphStrings(TokenStream tokenStream, String... expectedStrings) throws IOException {
+ Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
+ Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
+
+ Set<String> expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings));
+
+ BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+ Set<String> actualStrings = new HashSet<>();
+ for (IntsRef ir: actualStringPaths) {
+ actualStrings.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+ }
+ for (String s : actualStrings) {
+ assertTrue("Analyzer created unexpected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), expectedStringsSet.contains(s));
+ }
+ for (String s : expectedStrings) {
+ assertTrue("Analyzer was missing expected string path: " + s + "\nexpected:\n" + toString(expectedStringsSet) + "\nactual:\n" + toString(actualStrings), actualStrings.contains(s));
+ }
+ }
+
+ /** Returns all paths accepted by the token stream graph produced by analyzing text with the provided analyzer. The tokens {@link
+ * CharTermAttribute} values are concatenated, and separated with space. */
+ public static Set<String> getGraphStrings(Analyzer analyzer, String text) throws IOException {
+ try(TokenStream tokenStream = analyzer.tokenStream("dummy", text)) {
+ return getGraphStrings(tokenStream);
+ }
+ }
+
+ /** Returns all paths accepted by the token stream graph produced by the already initialized {@link TokenStream}. */
+ public static Set<String> getGraphStrings(TokenStream tokenStream) throws IOException {
+ Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream);
+ Set<IntsRef> actualStringPaths = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1);
+ BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder();
+ Set<String> paths = new HashSet<>();
+ for (IntsRef ir: actualStringPaths) {
+ paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '));
+ }
+ return paths;
+ }
+
+ /** Returns a {@code String} summary of the tokens this analyzer produces on this text */
+ public static String toString(Analyzer analyzer, String text) throws IOException {
+ try(TokenStream ts = analyzer.tokenStream("field", text)) {
+ StringBuilder b = new StringBuilder();
+ CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
+ PositionLengthAttribute posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+ OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
+ assertNotNull(offsetAtt);
+ ts.reset();
+ int pos = -1;
+ while (ts.incrementToken()) {
+ pos += posIncAtt.getPositionIncrement();
+ b.append(termAtt);
+ b.append(" at pos=");
+ b.append(pos);
+ if (posLengthAtt != null) {
+ b.append(" to pos=");
+ b.append(pos + posLengthAtt.getPositionLength());
+ }
+ b.append(" offsets=");
+ b.append(offsetAtt.startOffset());
+ b.append('-');
+ b.append(offsetAtt.endOffset());
+ b.append('\n');
+ }
+ ts.end();
+ return b.toString();
+ }
+ }
}