Posted to commits@lucene.apache.org by ho...@apache.org on 2017/01/16 00:10:30 UTC

[12/50] [abbrv] lucene-solr:jira/solr-5944: promote this test case to core

promote this test case to core


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/f985fcaa
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/f985fcaa
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/f985fcaa

Branch: refs/heads/jira/solr-5944
Commit: f985fcaa23cb9ef96ed823e5bf7957049e0d9461
Parents: e64111c
Author: Mike McCandless <mi...@apache.org>
Authored: Sun Jan 8 06:38:37 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Sun Jan 8 06:38:37 2017 -0500

----------------------------------------------------------------------
 .../lucene/analysis/TestGraphTokenizers.java    | 600 +++++++++++++++++++
 .../lucene/analysis/TestGraphTokenizers.java    | 600 -------------------
 2 files changed, 600 insertions(+), 600 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f985fcaa/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
new file mode 100644
index 0000000..8899dd1
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
@@ -0,0 +1,600 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+
+import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
+
+public class TestGraphTokenizers extends BaseTokenStreamTestCase {
+
+  // Makes a graph TokenStream from the string; separate
+  // positions with a single space, multiple tokens at the
+  // same position with /, and add an optional position
+  // length with :.  E.g. "a b c" is a simple chain, "a/x b c"
+  // adds 'x' over 'a' at position 0 with posLen=1, and
+  // "a/x:3 b c" adds 'x' over 'a' with posLen=3.  Tokens are
+  // in normal form, so offsets are computed based on the
+  // first token at a given position.  NOTE: each token must
+  // be a single character!  We assume this when computing
+  // offsets.
+
+  // NOTE: all input tokens must be length 1!  This means you
+  // cannot turn on MockCharFilter when random testing.
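+
+  // E.g. "a/x:3 b c" produces these tokens
+  // (term: posInc, posLen, startOffset-endOffset):
+  //   a: posInc=1, posLen=1, offsets 0-1
+  //   x: posInc=0, posLen=3, offsets 0-5
+  //   b: posInc=1, posLen=1, offsets 2-3
+  //   c: posInc=1, posLen=1, offsets 4-5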
+
+  private static class GraphTokenizer extends Tokenizer {
+    private List<Token> tokens;
+    private int upto;
+    private int inputLength;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      tokens = null;
+      upto = 0;
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (tokens == null) {
+        fillTokens();
+      }
+      //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
+      if (upto == tokens.size()) {
+        //System.out.println("  END @ " + tokens.size());
+        return false;
+      } 
+      final Token t = tokens.get(upto++);
+      //System.out.println("  return token=" + t);
+      clearAttributes();
+      termAtt.append(t.toString());
+      offsetAtt.setOffset(t.startOffset(), t.endOffset());
+      posIncrAtt.setPositionIncrement(t.getPositionIncrement());
+      posLengthAtt.setPositionLength(t.getPositionLength());
+      return true;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      // NOTE: somewhat hackish, but we need this to satisfy
+      // BaseTokenStreamTestCase (BTSTC):
+      final int lastOffset;
+      if (tokens != null && !tokens.isEmpty()) {
+        lastOffset = tokens.get(tokens.size()-1).endOffset();
+      } else {
+        lastOffset = 0;
+      }
+      offsetAtt.setOffset(correctOffset(lastOffset),
+                          correctOffset(inputLength));
+    }
+
+    private void fillTokens() throws IOException {
+      final StringBuilder sb = new StringBuilder();
+      final char[] buffer = new char[256];
+      while (true) {
+        final int count = input.read(buffer);
+        if (count == -1) {
+          break;
+        }
+        sb.append(buffer, 0, count);
+        //System.out.println("got count=" + count);
+      }
+      //System.out.println("fillTokens: " + sb);
+
+      inputLength = sb.length();
+
+      final String[] parts = sb.toString().split(" ");
+
+      tokens = new ArrayList<>();
+      int pos = 0;
+      int maxPos = -1;
+      int offset = 0;
+      //System.out.println("again");
+      for(String part : parts) {
+        final String[] overlapped = part.split("/");
+        boolean firstAtPos = true;
+        int minPosLength = Integer.MAX_VALUE;
+        for(String part2 : overlapped) {
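+          // A part is either "x" (implicit posLength=1) or
+          // "x:3" (explicit posLength after the colon):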
+          final int colonIndex = part2.indexOf(':');
+          final String token;
+          final int posLength;
+          if (colonIndex != -1) {
+            token = part2.substring(0, colonIndex);
+            posLength = Integer.parseInt(part2.substring(1+colonIndex));
+          } else {
+            token = part2;
+            posLength = 1;
+          }
+          maxPos = Math.max(maxPos, pos + posLength);
+          minPosLength = Math.min(minPosLength, posLength);
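+          // Synthetic offsets: each position occupies two input
+          // chars (token char + space), so a token spanning
+          // posLength positions ends at offset + 2*posLength - 1: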
+          final Token t = new Token(token, offset, offset + 2*posLength - 1);
+          t.setPositionLength(posLength);
+          t.setPositionIncrement(firstAtPos ? 1:0);
+          firstAtPos = false;
+          //System.out.println("  add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
+          tokens.add(t);
+        }
+        pos += minPosLength;
+        offset = 2 * pos;
+      }
+      assert maxPos <= pos: "input string malformed: posLength>1 tokens hang over the end";
+    }
+  }
+
+  public void testMockGraphTokenFilterBasic() throws Exception {
+
+    for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
+    }
+  }
+
+  public void testMockGraphTokenFilterOnGraphInput() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new GraphTokenizer();
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
+    }
+  }
+
+  // Just deletes token 'a' (leaving a hole):
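+  // E.g. "x a a b" comes out as "x b" with 'b' getting
+  // posInc=3; posIncs of trailing 'a's are added to the
+  // final increment in end():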
+  private final static class RemoveATokens extends TokenFilter {
+    private int pendingPosInc;
+
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+    public RemoveATokens(TokenStream in) {
+      super(in);
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      pendingPosInc = 0;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      while (true) {
+        final boolean gotOne = input.incrementToken();
+        if (!gotOne) {
+          return false;
+        } else if (termAtt.toString().equals("a")) {
+          pendingPosInc += posIncAtt.getPositionIncrement();
+        } else {
+          posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
+          pendingPosInc = 0;
+          return true;
+        }
+      }
+    }
+  }
+
+  public void testMockGraphTokenFilterBeforeHoles() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+            final TokenStream t3 = new RemoveATokens(t2);
+            return new TokenStreamComponents(t, t3);
+          }
+        };
+
+      Random random = random();
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+    }
+  }
+
+  public void testMockGraphTokenFilterAfterHoles() throws Exception {
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new RemoveATokens(t);
+            final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
+            return new TokenStreamComponents(t, t3);
+          }
+        };
+
+      Random random = random();
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
+      checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
+    }
+  }
+
+  public void testMockGraphTokenFilterRandom() throws Exception {
+    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      Random random = random();
+      checkRandomData(random, a, 5, atLeast(100));
+    }
+  }
+
+  // Two MockGraphTokenFilters
+  public void testDoubleMockGraphTokenFilterRandom() throws Exception {
+    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockGraphTokenFilter(random(), t);
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      Random random = random();
+      checkRandomData(random, a, 5, atLeast(100));
+    }
+  }
+
+  public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
+    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockGraphTokenFilter(random(), t);
+            final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      Random random = random();
+      checkRandomData(random, a, 5, atLeast(100));
+    }
+  }
+
+  public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
+    for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) {
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + iter);
+      }
+
+      // Make new analyzer each time, because MGTF has fixed
+      // seed:
+      final Analyzer a = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName) {
+            final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
+            final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
+            return new TokenStreamComponents(t, t2);
+          }
+        };
+      
+      Random random = random();
+      checkRandomData(random, a, 5, atLeast(100));
+    }
+  }
+
+  private static Token token(String term, int posInc, int posLength) {
+    final Token t = new Token(term, 0, 0);
+    t.setPositionIncrement(posInc);
+    t.setPositionLength(posLength);
+    return t;
+  }
+
+  private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+    final Token t = new Token(term, startOffset, endOffset);
+    t.setPositionIncrement(posInc);
+    t.setPositionLength(posLength);
+    return t;
+  }
+
+  public void testSingleToken() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+      });
+    assertSameLanguage(s2a("abc"), ts);
+  }
+
+  public void testMultipleHoles() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("a", 1, 1),
+        token("b", 3, 1),
+      });
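+    // posInc=3 on "b" skips two positions; each skipped
+    // position becomes a HOLE between POS_SEP transitions: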
+    assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts);
+  }
+
+  public void testSynOverMultipleHoles() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("a", 1, 1),
+        token("x", 0, 3),
+        token("b", 3, 1),
+      });
+    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
+    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
+    assertSameLanguage(Operations.union(a1, a2), ts);
+  }
+
+  // for debugging!
+  /*
+  private static void toDot(Automaton a) throws IOException {
+    final String s = a.toDot();
+    Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+    w.write(s);
+    w.close();
+    System.out.println("TEST: saved to /x/tmp/out.dot");
+  }
+  */
+
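+  // POS_SEP labels the transition between consecutive positions;
+  // HOLE labels a position that has no token (see
+  // TokenStreamToAutomaton):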
+  private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP);
+  private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE);
+
+  private Automaton join(String ... strings) {
+    List<Automaton> as = new ArrayList<>();
+    for(String s : strings) {
+      as.add(s2a(s));
+      as.add(SEP_A);
+    }
+    as.remove(as.size()-1);
+    return Operations.concatenate(as);
+  }
+
+  private Automaton join(Automaton ... as) {
+    return Operations.concatenate(Arrays.asList(as));
+  }
+
+  private Automaton s2a(String s) {
+    return Automata.makeString(s);
+  }
+
+  public void testTwoTokens() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("def", 1, 1),
+      });
+    assertSameLanguage(join("abc", "def"), ts);
+  }
+
+  public void testHole() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("def", 2, 1),
+      });
+    assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts);
+  }
+
+  public void testOverlappedTokensSausage() throws Exception {
+
+    // Two tokens on top of each other (sausage):
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 1)
+      });
+    final Automaton a1 = s2a("abc");
+    final Automaton a2 = s2a("xyz");
+    assertSameLanguage(Operations.union(a1, a2), ts);
+  }
+
+  public void testOverlappedTokensLattice() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 2),
+        token("def", 1, 1),
+      });
+    final Automaton a1 = s2a("xyz");
+    final Automaton a2 = join("abc", "def");
+    assertSameLanguage(Operations.union(a1, a2), ts);
+  }
+
+  public void testSynOverHole() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("a", 1, 1),
+        token("X", 0, 2),
+        token("b", 2, 1),
+      });
+    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
+    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
+    assertSameLanguage(expected, ts);
+  }
+
+  public void testSynOverHole2() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("xyz", 1, 1),
+        token("abc", 0, 3),
+        token("def", 2, 1),
+      });
+    final Automaton expected = Operations.union(
+      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
+    assertSameLanguage(expected, ts);
+  }
+
+  public void testOverlappedTokensLattice2() throws Exception {
+
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 0, 3),
+        token("def", 1, 1),
+        token("ghi", 1, 1),
+      });
+    final Automaton a1 = s2a("xyz");
+    final Automaton a2 = join("abc", "def", "ghi");
+    assertSameLanguage(Operations.union(a1, a2), ts);
+  }
+
+  public void testToDot() throws Exception {
+    final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)});
+    StringWriter w = new StringWriter();
+    new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot();
+    assertTrue(w.toString().contains("abc / abcd"));
+  }
+
+  public void testStartsWithHole() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 2, 1),
+      });
+    assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts);
+  }
+
+  // TODO: testEndsWithHole... but we need posInc to be set in TS.end()
+
+  public void testSynHangingOverEnd() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("a", 1, 1),
+        token("X", 0, 10),
+      });
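+    // 'X' (posLength=10) hangs over the end of the stream, so it
+    // ends at the final state just as 'a' does: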
+    assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
+  }
+
+  private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException {
+    assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts));
+  }
+
+  private void assertSameLanguage(Automaton expected, Automaton actual) {
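+    // sameLanguage expects deterministic automata with no dead
+    // states, so normalize both sides first: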
+    assertTrue(Operations.sameLanguage(
+      Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_MAX_DETERMINIZED_STATES),
+      Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_MAX_DETERMINIZED_STATES)));
+  }
+
+  public void testTokenStreamGraphWithHoles() throws Exception {
+    final TokenStream ts = new CannedTokenStream(
+      new Token[] {
+        token("abc", 1, 1),
+        token("xyz", 1, 8),
+        token("def", 1, 1),
+        token("ghi", 1, 1),
+      });
+    assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")),
+                                        join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/f985fcaa/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
deleted file mode 100644
index 8899dd1..0000000
--- a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java
+++ /dev/null