You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/12 06:30:14 UTC

svn commit: r1531498 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/test/org/apache/lucene/analysis/ lucene/test-framework/ lucene/test-framework/src/java/org/apache/lucene/analysis/

Author: rmuir
Date: Sat Oct 12 04:30:13 2013
New Revision: 1531498

URL: http://svn.apache.org/r1531498
Log:
LUCENE-5278: remove CharTokenizer brain-damage from MockTokenizer so it works better with custom regular expressions

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sat Oct 12 04:30:13 2013
@@ -125,6 +125,13 @@ Build
 * LUCENE-5249, LUCENE-5257: All Lucene/Solr modules should use the same
   dependency versions. (Steve Rowe)
 
+Tests
+
+* LUCENE-5278: Fix MockTokenizer to work better with more regular expression
+  patterns. Previously it could only behave like CharTokenizer (where a character
+  is either a "word" character or not), but now it gives a general longest-match
+  behavior.  (Nik Everett via Robert Muir)
+
 ======================= Lucene 4.5.0 =======================
 
 New features

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java Sat Oct 12 04:30:13 2013
@@ -8,6 +8,7 @@ import java.util.Random;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -63,6 +64,83 @@ public class TestMockAnalyzer extends Ba
         new String[] { "aba4cadaba-Shazam" });
     assertAnalyzesTo(a, "break+on/Nothing",
         new String[] { "break+on/Nothing" });
+    // currently though emits no tokens for empty string: maybe we can do it,
+    // but we don't want to emit tokens infinitely...
+    assertAnalyzesTo(a, "", new String[0]);
+  }
+  
+  // Test some regular expressions as tokenization patterns
+  /** Test a configuration where each character is a term */
+  public void testSingleChar() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp(".").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "f", "o", "o", "b", "a", "r" },
+        new int[] { 0, 1, 2, 3, 4, 5 },
+        new int[] { 1, 2, 3, 4, 5, 6 }
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where two characters make a term */
+  public void testTwoChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("..").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "fo", "ob", "ar"},
+        new int[] { 0, 2, 4 },
+        new int[] { 2, 4, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "fo", "ob" },
+        new int[] { 0, 2 },
+        new int[] { 2, 4 },
+        new int[] { 1, 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where three characters make a term */
+  public void testThreeChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("...").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "foo", "bar"},
+        new int[] { 0, 3 },
+        new int[] { 3, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "foo" },
+        new int[] { 0 },
+        new int[] { 3 },
+        new int[] { 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where a word starts with one uppercase letter */
+  public void testUppercase() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "FooBarBAZ",
+        new String[] { "Foo", "Bar", "B", "A", "Z"},
+        new int[] { 0, 3, 6, 7, 8 },
+        new int[] { 3, 6, 7, 8, 9 }
+    );
+    assertAnalyzesTo(a, "aFooBar",
+        new String[] { "Foo", "Bar" },
+        new int[] { 1, 4 },
+        new int[] { 4, 7 }
+    );
+    checkRandomData(random(), a, 100);
   }
   
   /** Test a configuration that behaves a lot like StopAnalyzer */
@@ -95,6 +173,29 @@ public class TestMockAnalyzer extends Ba
         new int[] { 1, 2 });
   }
   
+  /** Test MockTokenizer encountering a too long token */
+  public void testTooLongToken() throws Exception {
+    Analyzer whitespace = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+        return new TokenStreamComponents(t, t);
+      }
+    };
+    
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
+        new String[] { "test", "123", "toolo", "ng", "ok" },
+        new int[] { 0, 5, 9, 14, 17 },
+        new int[] { 4, 8, 14, 16, 19 },
+        new Integer(20));
+    
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
+        new String[] { "test", "123", "toolo" },
+        new int[] { 0, 5, 9 },
+        new int[] { 4, 8, 14 },
+        new Integer(14));
+  }
+  
   public void testLUCENE_3042() throws Exception {
     String testString = "t";
     
@@ -121,6 +222,25 @@ public class TestMockAnalyzer extends Ba
     checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
   }
   
+  /** blast some random strings through differently configured tokenizers */
+  public void testRandomRegexps() throws Exception {
+    int iters = atLeast(30);
+    for (int i = 0; i < iters; i++) {
+      final CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()));
+      final boolean lowercase = random().nextBoolean();
+      final int limit = _TestUtil.nextInt(random(), 0, 500);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+          return new TokenStreamComponents(t, t);
+        }
+      };
+      checkRandomData(random(), a, 100);
+      a.close();
+    }
+  }
+  
   public void testForwardOffsets() throws Exception {
     int num = atLeast(10000);
     for (int i = 0; i < num; i++) {

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java Sat Oct 12 04:30:13 2013
@@ -64,6 +64,11 @@ public class MockTokenizer extends Token
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   int off = 0;
+  
+  // buffered state (previous codepoint and offset). we replay this once we
+  // hit a reject state in case it's permissible as the start of a new term.
+  int bufferedCodePoint = -1; // -1 indicates empty buffer
+  int bufferedOff = -1;
 
   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
   // currently, we can only check that the lifecycle is correct if someone is reusing,
@@ -121,8 +126,16 @@ public class MockTokenizer extends Token
                             : "incrementToken() called while in wrong state: " + streamState;
     clearAttributes();
     for (;;) {
-      int startOffset = off;
-      int cp = readCodePoint();
+      int startOffset;
+      int cp;
+      if (bufferedCodePoint >= 0) {
+        cp = bufferedCodePoint;
+        startOffset = bufferedOff;
+        bufferedCodePoint = -1;
+      } else {
+        startOffset = off;
+        cp = readCodePoint();
+      }
       if (cp < 0) {
         break;
       } else if (isTokenChar(cp)) {
@@ -138,6 +151,14 @@ public class MockTokenizer extends Token
           cp = readCodePoint();
         } while (cp >= 0 && isTokenChar(cp));
         
+        if (termAtt.length() < maxTokenLength) {
+          // buffer up, in case the "rejected" char can start a new word of its own
+          bufferedCodePoint = cp;
+          bufferedOff = endOffset;
+        } else {
+          // otherwise, it's because we hit the term limit.
+          bufferedCodePoint = -1;
+        }
         int correctedStartOffset = correctOffset(startOffset);
         int correctedEndOffset = correctOffset(endOffset);
         assert correctedStartOffset >= 0;
@@ -146,8 +167,11 @@ public class MockTokenizer extends Token
         lastOffset = correctedStartOffset;
         assert correctedEndOffset >= correctedStartOffset;
         offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
-        streamState = State.INCREMENT;
-        return true;
+        if (state == -1 || runAutomaton.isAccept(state)) {
+          // either we hit a reject state (longest match), or end-of-text, but in an accept state
+          streamState = State.INCREMENT;
+          return true;
+        }
       }
     }
     streamState = State.INCREMENT_FALSE;
@@ -203,9 +227,11 @@ public class MockTokenizer extends Token
   }
 
   protected boolean isTokenChar(int c) {
-    state = runAutomaton.step(state, c);
     if (state < 0) {
       state = runAutomaton.getInitialState();
+    }
+    state = runAutomaton.step(state, c);
+    if (state < 0) {
       return false;
     } else {
       return true;
@@ -221,6 +247,7 @@ public class MockTokenizer extends Token
     super.reset();
     state = runAutomaton.getInitialState();
     lastOffset = off = 0;
+    bufferedCodePoint = -1;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }