You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/03/23 18:56:24 UTC
svn commit: r1304528 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/
lucene/contrib/facet/ lucene/cont...
Author: mikemccand
Date: Fri Mar 23 17:56:23 2012
New Revision: 1304528
URL: http://svn.apache.org/viewvc?rev=1304528&view=rev
Log:
LUCENE-3905: sometimes run real-ish content (from LineFileDocs) through the analyzers too; fix end() offset bugs in the ngram tokenizers/filters
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
lucene/dev/branches/branch_3x/lucene/contrib/facet/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/spellchecker/ (props changed)
lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/core/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Fri Mar 23 17:56:23 2012
@@ -73,7 +73,7 @@ public final class EdgeNGramTokenizer ex
private int maxGram;
private int gramSize;
private Side side;
- private boolean started = false;
+ private boolean started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
@@ -178,7 +178,7 @@ public final class EdgeNGramTokenizer ex
/** Returns the next token in the stream, or null at EOS. */
@Override
- public final boolean incrementToken() throws IOException {
+ public boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
@@ -188,13 +188,28 @@ public final class EdgeNGramTokenizer ex
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
while (charsRead < chars.length) {
- int inc = input.read(chars, charsRead, chars.length-charsRead);
+ final int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
break;
}
charsRead += inc;
}
+
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+ if (charsRead == chars.length) {
+ // Read extra throwaway chars so that on end() we
+ // report the correct offset:
+ char[] throwaway = new char[1024];
+ while(true) {
+ final int inc = input.read(throwaway, 0, throwaway.length);
+ if (inc == -1) {
+ break;
+ }
+ charsRead += inc;
+ }
+ }
+
inLen = inStr.length();
if (inLen == 0) {
return false;
@@ -221,21 +236,15 @@ public final class EdgeNGramTokenizer ex
}
@Override
- public final void end() {
+ public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
- }
-
- @Override
public void reset() throws IOException {
super.reset();
started = false;
- charsRead = 0;
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Fri Mar 23 17:56:23 2012
@@ -34,11 +34,11 @@ public final class NGramTokenizer extend
private int minGram, maxGram;
private int gramSize;
- private int pos = 0;
+ private int pos;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
- private boolean started = false;
+ private boolean started;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -99,7 +99,7 @@ public final class NGramTokenizer extend
/** Returns the next token in the stream, or null at EOS. */
@Override
- public final boolean incrementToken() throws IOException {
+ public boolean incrementToken() throws IOException {
clearAttributes();
if (!started) {
started = true;
@@ -115,6 +115,20 @@ public final class NGramTokenizer extend
charsRead += inc;
}
inStr = new String(chars, 0, charsRead).trim(); // remove any trailing empty strings
+
+ if (charsRead == chars.length) {
+ // Read extra throwaway chars so that on end() we
+ // report the correct offset:
+ char[] throwaway = new char[1024];
+ while(true) {
+ final int inc = input.read(throwaway, 0, throwaway.length);
+ if (inc == -1) {
+ break;
+ }
+ charsRead += inc;
+ }
+ }
+
inLen = inStr.length();
if (inLen == 0) {
return false;
@@ -138,22 +152,16 @@ public final class NGramTokenizer extend
}
@Override
- public final void end() {
+ public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
- }
-
- @Override
public void reset() throws IOException {
super.reset();
started = false;
pos = 0;
- charsRead = 0;
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Mar 23 17:56:23 2012
@@ -110,6 +110,7 @@ public class EdgeNGramTokenizerTest exte
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
Analyzer b = new ReusableAnalyzerBase() {
@Override
@@ -119,5 +120,6 @@ public class EdgeNGramTokenizerTest exte
}
};
checkRandomData(random, b, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, b, 200*RANDOM_MULTIPLIER, 8192);
}
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Mar 23 17:56:23 2012
@@ -100,5 +100,6 @@ public class NGramTokenizerTest extends
}
};
checkRandomData(random, a, 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random, a, 200*RANDOM_MULTIPLIER, 8192);
}
}
Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1304528&r1=1304527&r2=1304528&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Fri Mar 23 17:56:23 2012
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util._TestUtil;
/**
@@ -359,12 +360,22 @@ public abstract class BaseTokenStreamTes
}
private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple) throws IOException {
+
+ final LineFileDocs docs = new LineFileDocs(random);
+
for (int i = 0; i < iterations; i++) {
String text;
- if (simple) {
- text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
+
+ if (random.nextInt(10) == 7) {
+ text = docs.nextDoc().get("body");
+ if (text.length() > maxWordLength) {
+ text = text.substring(0, maxWordLength);
+ }
} else {
- switch(_TestUtil.nextInt(random, 0, 4)) {
+ if (simple) {
+ text = random.nextBoolean() ? _TestUtil.randomSimpleString(random, maxWordLength) : _TestUtil.randomHtmlishString(random, maxWordLength);
+ } else {
+ switch(_TestUtil.nextInt(random, 0, 4)) {
case 0:
text = _TestUtil.randomSimpleString(random, maxWordLength);
break;
@@ -376,6 +387,7 @@ public abstract class BaseTokenStreamTes
break;
default:
text = _TestUtil.randomUnicodeString(random, maxWordLength);
+ }
}
}