You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2017/01/16 00:10:30 UTC
[12/50] [abbrv] lucene-solr:jira/solr-5944: promote this test case to core
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f985fcaa
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f985fcaa
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f985fcaa
Branch: refs/heads/jira/solr-5944
Commit: f985fcaa23cb9ef96ed823e5bf7957049e0d9461
Parents: e64111c
Author: Mike McCandless <mi...@apache.org>
Authored: Sun Jan 8 06:38:37 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Sun Jan 8 06:38:37 2017 -0500
----------------------------------------------------------------------
.../lucene/analysis/TestGraphTokenizers.java | 600 +++++++++++++++++++
.../lucene/analysis/TestGraphTokenizers.java | 600 -------------------
2 files changed, 600 insertions(+), 600 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f985fcaa/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
new file mode 100644
index 0000000..8899dd1
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -0,0 +1,600 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
+public class TestGraphTokenizers extends BaseTokenStreamTestCase {
+
+ // Makes a graph TokenStream from the string; separate
+ // positions with single space, multiple tokens at the same
+ // position with /, and add optional position length with
+ // :. EG "a b c" is a simple chain, "a/x b c" adds 'x'
+ // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
+ // 'x' over a with posLen=3. Tokens are in normal-form!
+ // So, offsets are computed based on the first token at a
+ // given position. NOTE: each token must be a single
+ // character! We assume this when computing offsets...
+
+ // NOTE: all input tokens must be length 1!!! This means
+ // you cannot turn on MockCharFilter when random
+ // testing...
+
+ private static class GraphTokenizer extends Tokenizer {
+ private List<Token> tokens;
+ private int upto;
+ private int inputLength;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ tokens = null;
+ upto = 0;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (tokens == null) {
+ fillTokens();
+ }
+ //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
+ if (upto == tokens.size()) {
+ //System.out.println(" END @ " + tokens.size());
+ return false;
+ }
+ final Token t = tokens.get(upto++);
+ //System.out.println(" return token=" + t);
+ clearAttributes();
+ termAtt.append(t.toString());
+ offsetAtt.setOffset(t.startOffset(), t.endOffset());
+ posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+ posLengthAtt.setPositionLength(t.getPositionLength());
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ // NOTE: somewhat... hackish, but we need this to
+ // satisfy BTSTC:
+ final int lastOffset;
+ if (tokens != null && !tokens.isEmpty()) {
+ lastOffset = tokens.get(tokens.size()-1).endOffset();
+ } else {
+ lastOffset = 0;
+ }
+ offsetAtt.setOffset(correctOffset(lastOffset),
+ correctOffset(inputLength));
+ }
+
+ private void fillTokens() throws IOException {
+ final StringBuilder sb = new StringBuilder();
+ final char[] buffer = new char[256];
+ while (true) {
+ final int count = input.read(buffer);
+ if (count == -1) {
+ break;
+ }
+ sb.append(buffer, 0, count);
+ //System.out.println("got count=" + count);
+ }
+ //System.out.println("fillTokens: " + sb);
+
+ inputLength = sb.length();
+
+ final String[] parts = sb.toString().split(" ");
+
+ tokens = new ArrayList<>();
+ int pos = 0;
+ int maxPos = -1;
+ int offset = 0;
+ //System.out.println("again");
+ for(String part : parts) {
+ final String[] overlapped = part.split("/");
+ boolean firstAtPos = true;
+ int minPosLength = Integer.MAX_VALUE;
+ for(String part2 : overlapped) {
+ final int colonIndex = part2.indexOf(':');
+ final String token;
+ final int posLength;
+ if (colonIndex != -1) {
+ token = part2.substring(0, colonIndex);
+ posLength = Integer.parseInt(part2.substring(1+colonIndex));
+ } else {
+ token = part2;
+ posLength = 1;
+ }
+ maxPos = Math.max(maxPos, pos + posLength);
+ minPosLength = Math.min(minPosLength, posLength);
+ final Token t = new Token(token, offset, offset + 2*posLength - 1);
+ t.setPositionLength(posLength);
+ t.setPositionIncrement(firstAtPos ? 1:0);
+ firstAtPos = false;
+ //System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
+ tokens.add(t);
+ }
+ pos += minPosLength;
+ offset = 2 * pos;
+ }
+ assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end";
+ }
+ }
+
+ public void testMockGraphTokenFilterBasic() throws Exception {
+
+ for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
+ }
+ }
+
+ public void testMockGraphTokenFilterOnGraphInput() throws Exception {
+ for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new GraphTokenizer();
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
+ }
+ }
+
+ // Just deletes (leaving hole) token 'a':
+ private final static class RemoveATokens extends TokenFilter {
+ private int pendingPosInc;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ public RemoveATokens(TokenStream in) {
+ super(in);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ pendingPosInc = 0;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (true) {
+ final boolean gotOne = input.incrementToken();
+ if (!gotOne) {
+ return false;
+ } else if (termAtt.toString().equals("a")) {
+ pendingPosInc += posIncAtt.getPositionIncrement();
+ } else {
+ posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+ pendingPosInc = 0;
+ return true;
+ }
+ }
+ }
+ }
+
+ public void testMockGraphTokenFilterBeforeHoles() throws Exception {
+ for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+ final TokenStream t3 = new RemoveATokens(t2);
+ return new TokenStreamComponents(t, t3);
+ }
+ };
+
+ Random random = random();
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+ checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+ }
+ }
+
+ public void testMockGraphTokenFilterAfterHoles() throws Exception {
+ for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t2 = new RemoveATokens(t);
+ final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
+ return new TokenStreamComponents(t, t3);
+ }
+ };
+
+ Random random = random();
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+ checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+ checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+ }
+ }
+
+ public void testMockGraphTokenFilterRandom() throws Exception {
+ for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ Random random = random();
+ checkRandomData(random, a, 5, atLeast(100));
+ }
+ }
+
+ // Two MockGraphTokenFilters
+ public void testDoubleMockGraphTokenFilterRandom() throws Exception {
+ for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t1 = new MockGraphTokenFilter(random(), t);
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ Random random = random();
+ checkRandomData(random, a, 5, atLeast(100));
+ }
+ }
+
+ public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
+ for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t1 = new MockGraphTokenFilter(random(), t);
+ final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ Random random = random();
+ checkRandomData(random, a, 5, atLeast(100));
+ }
+ }
+
+ public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
+ for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter=" + iter);
+ }
+
+ // Make new analyzer each time, because MGTF has fixed
+ // seed:
+ final Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
+ final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
+ return new TokenStreamComponents(t, t2);
+ }
+ };
+
+ Random random = random();
+ checkRandomData(random, a, 5, atLeast(100));
+ }
+ }
+
+ private static Token token(String term, int posInc, int posLength) {
+ final Token t = new Token(term, 0, 0);
+ t.setPositionIncrement(posInc);
+ t.setPositionLength(posLength);
+ return t;
+ }
+
+ private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+ final Token t = new Token(term, startOffset, endOffset);
+ t.setPositionIncrement(posInc);
+ t.setPositionLength(posLength);
+ return t;
+ }
+
+ public void testSingleToken() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ });
+ assertSameLanguage(s2a("abc"), ts);
+ }
+
+ public void testMultipleHoles() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("a", 1, 1),
+ token("b", 3, 1),
+ });
+ assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
+ }
+
+ public void testSynOverMultipleHoles() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("a", 1, 1),
+ token("x", 0, 3),
+ token("b", 3, 1),
+ });
+ final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
+ final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
+ assertSameLanguage(Operations.union(a1, a2), ts);
+ }
+
+ // for debugging!
+ /*
+ private static void toDot(Automaton a) throws IOException {
+ final String s = a.toDot();
+ Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+ w.write(s);
+ w.close();
+ System.out.println("TEST: saved to /x/tmp/out.dot");
+ }
+ */
+
+ private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
+ private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);
+
+ private Automaton join(String ... strings) {
+ List<Automaton> as = new ArrayList<>();
+ for(String s : strings) {
+ as.add(s2a(s));
+ as.add(SEP_A);
+ }
+ as.remove(as.size()-1);
+ return Operations.concatenate(as);
+ }
+
+ private Automaton join(Automaton ... as) {
+ return Operations.concatenate(Arrays.asList(as));
+ }
+
+ private Automaton s2a(String s) {
+ return Automata.makeString(s);
+ }
+
+ public void testTwoTokens() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("def", 1, 1),
+ });
+ assertSameLanguage(join("abc", "def"), ts);
+ }
+
+ public void testHole() throws Exception {
+
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("def", 2, 1),
+ });
+ assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
+ }
+
+ public void testOverlappedTokensSausage() throws Exception {
+
+ // Two tokens on top of each other (sausage):
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("xyz", 0, 1)
+ });
+ final Automaton a1 = s2a("abc");
+ final Automaton a2 = s2a("xyz");
+ assertSameLanguage(Operations.union(a1, a2), ts);
+ }
+
+ public void testOverlappedTokensLattice() throws Exception {
+
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("xyz", 0, 2),
+ token("def", 1, 1),
+ });
+ final Automaton a1 = s2a("xyz");
+ final Automaton a2 = join("abc", "def");
+ assertSameLanguage(Operations.union(a1, a2), ts);
+ }
+
+ public void testSynOverHole() throws Exception {
+
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("a", 1, 1),
+ token("X", 0, 2),
+ token("b", 2, 1),
+ });
+ final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
+ final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
+ assertSameLanguage(expected, ts);
+ }
+
+ public void testSynOverHole2() throws Exception {
+
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("xyz", 1, 1),
+ token("abc", 0, 3),
+ token("def", 2, 1),
+ });
+ final Automaton expected = Operations.union(
+ join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
+ assertSameLanguage(expected, ts);
+ }
+
+ public void testOverlappedTokensLattice2() throws Exception {
+
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("xyz", 0, 3),
+ token("def", 1, 1),
+ token("ghi", 1, 1),
+ });
+ final Automaton a1 = s2a("xyz");
+ final Automaton a2 = join("abc", "def", "ghi");
+ assertSameLanguage(Operations.union(a1, a2), ts);
+ }
+
+ public void testToDot() throws Exception {
+ final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
+ StringWriter w = new StringWriter();
+ new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
+ assertTrue(w.toString().indexOf("abc / abcd") != -1);
+ }
+
+ public void testStartsWithHole() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 2, 1),
+ });
+ assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
+ }
+
+ // TODO: testEndsWithHole... but we need posInc to set in TS.end()
+
+ public void testSynHangingOverEnd() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("a", 1, 1),
+ token("X", 0, 10),
+ });
+ assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
+ }
+
+ private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
+ assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
+ }
+
+ private void assertSameLanguage(Automaton expected, Automaton actual) {
+ assertTrue(Operations.sameLanguage(
+ Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
+ Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
+ }
+
+ public void testTokenStreamGraphWithHoles() throws Exception {
+ final TokenStream ts = new CannedTokenStream(
+ new Token[] {
+ token("abc", 1, 1),
+ token("xyz", 1, 8),
+ token("def", 1, 1),
+ token("ghi", 1, 1),
+ });
+ assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
+ join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f985fcaa/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
deleted file mode 100644
index 8899dd1..0000000
--- a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ /dev/null
@@ -1,600 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis;
-
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Random;
-
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.Operations;
-
-import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
-
-public class TestGraphTokenizers extends BaseTokenStreamTestCase {
-
- // Makes a graph TokenStream from the string; separate
- // positions with single space, multiple tokens at the same
- // position with /, and add optional position length with
- // :. EG "a b c" is a simple chain, "a/x b c" adds 'x'
- // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds
- // 'x' over a with posLen=3. Tokens are in normal-form!
- // So, offsets are computed based on the first token at a
- // given position. NOTE: each token must be a single
- // character! We assume this when computing offsets...
-
- // NOTE: all input tokens must be length 1!!! This means
- // you cannot turn on MockCharFilter when random
- // testing...
-
- private static class GraphTokenizer extends Tokenizer {
- private List<Token> tokens;
- private int upto;
- private int inputLength;
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
-
- @Override
- public void reset() throws IOException {
- super.reset();
- tokens = null;
- upto = 0;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (tokens == null) {
- fillTokens();
- }
- //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
- if (upto == tokens.size()) {
- //System.out.println(" END @ " + tokens.size());
- return false;
- }
- final Token t = tokens.get(upto++);
- //System.out.println(" return token=" + t);
- clearAttributes();
- termAtt.append(t.toString());
- offsetAtt.setOffset(t.startOffset(), t.endOffset());
- posIncrAtt.setPositionIncrement(t.getPositionIncrement());
- posLengthAtt.setPositionLength(t.getPositionLength());
- return true;
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- // NOTE: somewhat... hackish, but we need this to
- // satisfy BTSTC:
- final int lastOffset;
- if (tokens != null && !tokens.isEmpty()) {
- lastOffset = tokens.get(tokens.size()-1).endOffset();
- } else {
- lastOffset = 0;
- }
- offsetAtt.setOffset(correctOffset(lastOffset),
- correctOffset(inputLength));
- }
-
- private void fillTokens() throws IOException {
- final StringBuilder sb = new StringBuilder();
- final char[] buffer = new char[256];
- while (true) {
- final int count = input.read(buffer);
- if (count == -1) {
- break;
- }
- sb.append(buffer, 0, count);
- //System.out.println("got count=" + count);
- }
- //System.out.println("fillTokens: " + sb);
-
- inputLength = sb.length();
-
- final String[] parts = sb.toString().split(" ");
-
- tokens = new ArrayList<>();
- int pos = 0;
- int maxPos = -1;
- int offset = 0;
- //System.out.println("again");
- for(String part : parts) {
- final String[] overlapped = part.split("/");
- boolean firstAtPos = true;
- int minPosLength = Integer.MAX_VALUE;
- for(String part2 : overlapped) {
- final int colonIndex = part2.indexOf(':');
- final String token;
- final int posLength;
- if (colonIndex != -1) {
- token = part2.substring(0, colonIndex);
- posLength = Integer.parseInt(part2.substring(1+colonIndex));
- } else {
- token = part2;
- posLength = 1;
- }
- maxPos = Math.max(maxPos, pos + posLength);
- minPosLength = Math.min(minPosLength, posLength);
- final Token t = new Token(token, offset, offset + 2*posLength - 1);
- t.setPositionLength(posLength);
- t.setPositionIncrement(firstAtPos ? 1:0);
- firstAtPos = false;
- //System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
- tokens.add(t);
- }
- pos += minPosLength;
- offset = 2 * pos;
- }
- assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end";
- }
- }
-
- public void testMockGraphTokenFilterBasic() throws Exception {
-
- for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t2 = new MockGraphTokenFilter(random(), t);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
- }
- }
-
- public void testMockGraphTokenFilterOnGraphInput() throws Exception {
- for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new GraphTokenizer();
- final TokenStream t2 = new MockGraphTokenFilter(random(), t);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
- }
- }
-
- // Just deletes (leaving hole) token 'a':
- private final static class RemoveATokens extends TokenFilter {
- private int pendingPosInc;
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-
- public RemoveATokens(TokenStream in) {
- super(in);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- pendingPosInc = 0;
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- while (true) {
- final boolean gotOne = input.incrementToken();
- if (!gotOne) {
- return false;
- } else if (termAtt.toString().equals("a")) {
- pendingPosInc += posIncAtt.getPositionIncrement();
- } else {
- posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
- pendingPosInc = 0;
- return true;
- }
- }
- }
- }
-
- public void testMockGraphTokenFilterBeforeHoles() throws Exception {
- for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t2 = new MockGraphTokenFilter(random(), t);
- final TokenStream t3 = new RemoveATokens(t2);
- return new TokenStreamComponents(t, t3);
- }
- };
-
- Random random = random();
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
- checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
- }
- }
-
- public void testMockGraphTokenFilterAfterHoles() throws Exception {
- for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t2 = new RemoveATokens(t);
- final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
- return new TokenStreamComponents(t, t3);
- }
- };
-
- Random random = random();
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
- checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
- checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
- }
- }
-
- public void testMockGraphTokenFilterRandom() throws Exception {
- for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t2 = new MockGraphTokenFilter(random(), t);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- Random random = random();
- checkRandomData(random, a, 5, atLeast(100));
- }
- }
-
- // Two MockGraphTokenFilters
- public void testDoubleMockGraphTokenFilterRandom() throws Exception {
- for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t1 = new MockGraphTokenFilter(random(), t);
- final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- Random random = random();
- checkRandomData(random, a, 5, atLeast(100));
- }
- }
-
- public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
- for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t1 = new MockGraphTokenFilter(random(), t);
- final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- Random random = random();
- checkRandomData(random, a, 5, atLeast(100));
- }
- }
-
- public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
- for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
-
- if (VERBOSE) {
- System.out.println("\nTEST: iter=" + iter);
- }
-
- // Make new analyzer each time, because MGTF has fixed
- // seed:
- final Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
- final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
- return new TokenStreamComponents(t, t2);
- }
- };
-
- Random random = random();
- checkRandomData(random, a, 5, atLeast(100));
- }
- }
-
- private static Token token(String term, int posInc, int posLength) {
- final Token t = new Token(term, 0, 0);
- t.setPositionIncrement(posInc);
- t.setPositionLength(posLength);
- return t;
- }
-
- private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
- final Token t = new Token(term, startOffset, endOffset);
- t.setPositionIncrement(posInc);
- t.setPositionLength(posLength);
- return t;
- }
-
- public void testSingleToken() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- });
- assertSameLanguage(s2a("abc"), ts);
- }
-
- public void testMultipleHoles() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("a", 1, 1),
- token("b", 3, 1),
- });
- assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
- }
-
- public void testSynOverMultipleHoles() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("a", 1, 1),
- token("x", 0, 3),
- token("b", 3, 1),
- });
- final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
- final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
- assertSameLanguage(Operations.union(a1, a2), ts);
- }
-
- // for debugging!
- /*
- private static void toDot(Automaton a) throws IOException {
- final String s = a.toDot();
- Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
- w.write(s);
- w.close();
- System.out.println("TEST: saved to /x/tmp/out.dot");
- }
- */
-
- private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
- private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);
-
- private Automaton join(String ... strings) {
- List<Automaton> as = new ArrayList<>();
- for(String s : strings) {
- as.add(s2a(s));
- as.add(SEP_A);
- }
- as.remove(as.size()-1);
- return Operations.concatenate(as);
- }
-
- private Automaton join(Automaton ... as) {
- return Operations.concatenate(Arrays.asList(as));
- }
-
- private Automaton s2a(String s) {
- return Automata.makeString(s);
- }
-
- public void testTwoTokens() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("def", 1, 1),
- });
- assertSameLanguage(join("abc", "def"), ts);
- }
-
- public void testHole() throws Exception {
-
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("def", 2, 1),
- });
- assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
- }
-
- public void testOverlappedTokensSausage() throws Exception {
-
- // Two tokens on top of each other (sausage):
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("xyz", 0, 1)
- });
- final Automaton a1 = s2a("abc");
- final Automaton a2 = s2a("xyz");
- assertSameLanguage(Operations.union(a1, a2), ts);
- }
-
- public void testOverlappedTokensLattice() throws Exception {
-
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("xyz", 0, 2),
- token("def", 1, 1),
- });
- final Automaton a1 = s2a("xyz");
- final Automaton a2 = join("abc", "def");
- assertSameLanguage(Operations.union(a1, a2), ts);
- }
-
- public void testSynOverHole() throws Exception {
-
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("a", 1, 1),
- token("X", 0, 2),
- token("b", 2, 1),
- });
- final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
- final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
- assertSameLanguage(expected, ts);
- }
-
- public void testSynOverHole2() throws Exception {
-
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("xyz", 1, 1),
- token("abc", 0, 3),
- token("def", 2, 1),
- });
- final Automaton expected = Operations.union(
- join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
- assertSameLanguage(expected, ts);
- }
-
- public void testOverlappedTokensLattice2() throws Exception {
-
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("xyz", 0, 3),
- token("def", 1, 1),
- token("ghi", 1, 1),
- });
- final Automaton a1 = s2a("xyz");
- final Automaton a2 = join("abc", "def", "ghi");
- assertSameLanguage(Operations.union(a1, a2), ts);
- }
-
- public void testToDot() throws Exception {
- final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
- StringWriter w = new StringWriter();
- new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
- assertTrue(w.toString().indexOf("abc / abcd") != -1);
- }
-
- public void testStartsWithHole() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 2, 1),
- });
- assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
- }
-
- // TODO: testEndsWithHole... but we need posInc to set in TS.end()
-
- public void testSynHangingOverEnd() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("a", 1, 1),
- token("X", 0, 10),
- });
- assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
- }
-
- private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
- assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
- }
-
- private void assertSameLanguage(Automaton expected, Automaton actual) {
- assertTrue(Operations.sameLanguage(
- Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
- Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
- }
-
- public void testTokenStreamGraphWithHoles() throws Exception {
- final TokenStream ts = new CannedTokenStream(
- new Token[] {
- token("abc", 1, 1),
- token("xyz", 1, 8),
- token("def", 1, 1),
- token("ghi", 1, 1),
- });
- assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
- join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
- }
-}