You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/12 06:30:14 UTC
svn commit: r1531498 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/core/ lucene/core/src/test/org/apache/lucene/analysis/
lucene/test-framework/
lucene/test-framework/src/java/org/apache/lucene/analysis/
Author: rmuir
Date: Sat Oct 12 04:30:13 2013
New Revision: 1531498
URL: http://svn.apache.org/r1531498
Log:
LUCENE-5278: remove CharTokenizer brain-damage from MockTokenizer so it works better with custom regular expressions
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Sat Oct 12 04:30:13 2013
@@ -125,6 +125,13 @@ Build
* LUCENE-5249, LUCENE-5257: All Lucene/Solr modules should use the same
dependency versions. (Steve Rowe)
+Tests
+
+* LUCENE-5278: Fix MockTokenizer to work better with more regular expression
+ patterns. Previously it could only behave like CharTokenizer (where a character
+ is either a "word" character or not), but now it gives a general longest-match
+ behavior. (Nik Everett via Robert Muir)
+
======================= Lucene 4.5.0 =======================
New features
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java Sat Oct 12 04:30:13 2013
@@ -8,6 +8,7 @@ import java.util.Random;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -63,6 +64,83 @@ public class TestMockAnalyzer extends Ba
new String[] { "aba4cadaba-Shazam" });
assertAnalyzesTo(a, "break+on/Nothing",
new String[] { "break+on/Nothing" });
+ // currently, though, it emits no tokens for the empty string: maybe we can do it,
+ // but we don't want to emit tokens infinitely...
+ assertAnalyzesTo(a, "", new String[0]);
+ }
+
+ // Test some regular expressions as tokenization patterns
+ /** Test a configuration where each character is a term */
+ public void testSingleChar() throws Exception {
+ CharacterRunAutomaton single =
+ new CharacterRunAutomaton(new RegExp(".").toAutomaton());
+ Analyzer a = new MockAnalyzer(random(), single, false);
+ assertAnalyzesTo(a, "foobar",
+ new String[] { "f", "o", "o", "b", "a", "r" },
+ new int[] { 0, 1, 2, 3, 4, 5 },
+ new int[] { 1, 2, 3, 4, 5, 6 }
+ );
+ checkRandomData(random(), a, 100);
+ }
+
+ /** Test a configuration where two characters makes a term */
+ public void testTwoChars() throws Exception {
+ CharacterRunAutomaton single =
+ new CharacterRunAutomaton(new RegExp("..").toAutomaton());
+ Analyzer a = new MockAnalyzer(random(), single, false);
+ assertAnalyzesTo(a, "foobar",
+ new String[] { "fo", "ob", "ar"},
+ new int[] { 0, 2, 4 },
+ new int[] { 2, 4, 6 }
+ );
+ // make sure when last term is a "partial" match that end() is correct
+ assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+ new String[] { "fo", "ob" },
+ new int[] { 0, 2 },
+ new int[] { 2, 4 },
+ new int[] { 1, 1 },
+ new Integer(5)
+ );
+ checkRandomData(random(), a, 100);
+ }
+
+ /** Test a configuration where three characters makes a term */
+ public void testThreeChars() throws Exception {
+ CharacterRunAutomaton single =
+ new CharacterRunAutomaton(new RegExp("...").toAutomaton());
+ Analyzer a = new MockAnalyzer(random(), single, false);
+ assertAnalyzesTo(a, "foobar",
+ new String[] { "foo", "bar"},
+ new int[] { 0, 3 },
+ new int[] { 3, 6 }
+ );
+ // make sure when last term is a "partial" match that end() is correct
+ assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+ new String[] { "foo" },
+ new int[] { 0 },
+ new int[] { 3 },
+ new int[] { 1 },
+ new Integer(5)
+ );
+ checkRandomData(random(), a, 100);
+ }
+
+ /** Test a configuration where word starts with one uppercase */
+ public void testUppercase() throws Exception {
+ CharacterRunAutomaton single =
+ new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
+ Analyzer a = new MockAnalyzer(random(), single, false);
+ assertAnalyzesTo(a, "FooBarBAZ",
+ new String[] { "Foo", "Bar", "B", "A", "Z"},
+ new int[] { 0, 3, 6, 7, 8 },
+ new int[] { 3, 6, 7, 8, 9 }
+ );
+ assertAnalyzesTo(a, "aFooBar",
+ new String[] { "Foo", "Bar" },
+ new int[] { 1, 4 },
+ new int[] { 4, 7 }
+ );
+ checkRandomData(random(), a, 100);
}
/** Test a configuration that behaves a lot like StopAnalyzer */
@@ -95,6 +173,29 @@ public class TestMockAnalyzer extends Ba
new int[] { 1, 2 });
}
+ /** Test MockTokenizer encountering a too long token */
+ public void testTooLongToken() throws Exception {
+ Analyzer whitespace = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+ return new TokenStreamComponents(t, t);
+ }
+ };
+
+ assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
+ new String[] { "test", "123", "toolo", "ng", "ok" },
+ new int[] { 0, 5, 9, 14, 17 },
+ new int[] { 4, 8, 14, 16, 19 },
+ new Integer(20));
+
+ assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
+ new String[] { "test", "123", "toolo" },
+ new int[] { 0, 5, 9 },
+ new int[] { 4, 8, 14 },
+ new Integer(14));
+ }
+
public void testLUCENE_3042() throws Exception {
String testString = "t";
@@ -121,6 +222,25 @@ public class TestMockAnalyzer extends Ba
checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
}
+ /** blast some random strings through differently configured tokenizers */
+ public void testRandomRegexps() throws Exception {
+ int iters = atLeast(30);
+ for (int i = 0; i < iters; i++) {
+ final CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()));
+ final boolean lowercase = random().nextBoolean();
+ final int limit = _TestUtil.nextInt(random(), 0, 500);
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+ return new TokenStreamComponents(t, t);
+ }
+ };
+ checkRandomData(random(), a, 100);
+ a.close();
+ }
+ }
+
public void testForwardOffsets() throws Exception {
int num = atLeast(10000);
for (int i = 0; i < num; i++) {
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java?rev=1531498&r1=1531497&r2=1531498&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java Sat Oct 12 04:30:13 2013
@@ -64,6 +64,11 @@ public class MockTokenizer extends Token
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
int off = 0;
+
+ // buffered state (previous codepoint and offset). we replay this once we
+ // hit a reject state in case it's permissible as the start of a new term.
+ int bufferedCodePoint = -1; // -1 indicates empty buffer
+ int bufferedOff = -1;
// TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
// currently, we can only check that the lifecycle is correct if someone is reusing,
@@ -121,8 +126,16 @@ public class MockTokenizer extends Token
: "incrementToken() called while in wrong state: " + streamState;
clearAttributes();
for (;;) {
- int startOffset = off;
- int cp = readCodePoint();
+ int startOffset;
+ int cp;
+ if (bufferedCodePoint >= 0) {
+ cp = bufferedCodePoint;
+ startOffset = bufferedOff;
+ bufferedCodePoint = -1;
+ } else {
+ startOffset = off;
+ cp = readCodePoint();
+ }
if (cp < 0) {
break;
} else if (isTokenChar(cp)) {
@@ -138,6 +151,14 @@ public class MockTokenizer extends Token
cp = readCodePoint();
} while (cp >= 0 && isTokenChar(cp));
+ if (termAtt.length() < maxTokenLength) {
+ // buffer up, in case the "rejected" char can start a new word of its own
+ bufferedCodePoint = cp;
+ bufferedOff = endOffset;
+ } else {
+ // otherwise, it's because we hit the term limit.
+ bufferedCodePoint = -1;
+ }
int correctedStartOffset = correctOffset(startOffset);
int correctedEndOffset = correctOffset(endOffset);
assert correctedStartOffset >= 0;
@@ -146,8 +167,11 @@ public class MockTokenizer extends Token
lastOffset = correctedStartOffset;
assert correctedEndOffset >= correctedStartOffset;
offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
- streamState = State.INCREMENT;
- return true;
+ if (state == -1 || runAutomaton.isAccept(state)) {
+ // either we hit a reject state (longest match), or end-of-text, but in an accept state
+ streamState = State.INCREMENT;
+ return true;
+ }
}
}
streamState = State.INCREMENT_FALSE;
@@ -203,9 +227,11 @@ public class MockTokenizer extends Token
}
protected boolean isTokenChar(int c) {
- state = runAutomaton.step(state, c);
if (state < 0) {
state = runAutomaton.getInitialState();
+ }
+ state = runAutomaton.step(state, c);
+ if (state < 0) {
return false;
} else {
return true;
@@ -221,6 +247,7 @@ public class MockTokenizer extends Token
super.reset();
state = runAutomaton.getInitialState();
lastOffset = off = 0;
+ bufferedCodePoint = -1;
assert !enableChecks || streamState != State.RESET : "double reset()";
streamState = State.RESET;
}