You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/08/27 01:17:50 UTC
svn commit: r1620759 - in /lucene/dev/trunk/lucene: ./
analysis/common/src/java/org/apache/lucene/analysis/ngram/
analysis/common/src/test/org/apache/lucene/analysis/ngram/
Author: rjernst
Date: Tue Aug 26 23:17:49 2014
New Revision: 1620759
URL: http://svn.apache.org/r1620759
Log:
LUCENE-5908: Fix Lucene43NGramTokenizer to be final
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Aug 26 23:17:49 2014
@@ -292,6 +292,8 @@ Bug Fixes
buffer size was reduced, and scanner buffer growth was disabled, resulting
in much, much faster tokenization for these text sequences.
(Chris Geeringh, Robert Muir, Steve Rowe)
+
+* LUCENE-5908: Fix Lucene43NGramTokenizer to be final
Test Framework
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java Tue Aug 26 23:17:49 2014
@@ -17,18 +17,105 @@ package org.apache.lucene.analysis.ngram
* limitations under the License.
*/
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;
/**
- * Tokenizes the input from an edge into n-grams of given size(s), using pre-4.4 behavior.
- *
- * @deprecated Use {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}.
+ * Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle
+ * supplementary characters.
*/
@Deprecated
-public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
+public final class Lucene43EdgeNGramTokenizer extends Tokenizer {
+ public static final Side DEFAULT_SIDE = Side.FRONT;
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+ /** Specifies which side of the input the n-gram should be generated from */
+ public static enum Side {
+
+ /** Get the n-gram from the front of the input */
+ FRONT {
+ @Override
+ public String getLabel() { return "front"; }
+ },
+
+ /** Get the n-gram from the end of the input */
+ BACK {
+ @Override
+ public String getLabel() { return "back"; }
+ };
+
+ public abstract String getLabel();
+
+ // Get the appropriate Side from a string
+ public static Side getSide(String sideName) {
+ if (FRONT.getLabel().equals(sideName)) {
+ return FRONT;
+ }
+ if (BACK.getLabel().equals(sideName)) {
+ return BACK;
+ }
+ return null;
+ }
+ }
+
+ private int minGram;
+ private int maxGram;
+ private int gramSize;
+ private Side side;
+ private boolean started;
+ private int inLen; // length of the input AFTER trim()
+ private int charsRead; // length of the input
+ private String inStr;
+
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public Lucene43EdgeNGramTokenizer(Side side, int minGram, int maxGram) {
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+ * @param side the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public Lucene43EdgeNGramTokenizer(AttributeFactory factory, Side side, int minGram, int maxGram) {
+ super(factory);
+ init(side, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public Lucene43EdgeNGramTokenizer(AttributeFactory factory, String sideLabel, int minGram, int maxGram) {
+ this(factory, Side.getSide(sideLabel), minGram, maxGram);
+ }
+
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
@@ -36,7 +123,19 @@ public class Lucene43EdgeNGramTokenizer
* @param maxGram the largest n-gram to generate
*/
public Lucene43EdgeNGramTokenizer(int minGram, int maxGram) {
- super(minGram, maxGram);
+ this(Side.FRONT, minGram, maxGram);
+ }
+
+ /**
+ * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+ *
+ * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ @Deprecated
+ public Lucene43EdgeNGramTokenizer(String sideLabel, int minGram, int maxGram) {
+ this(Side.getSide(sideLabel), minGram, maxGram);
}
/**
@@ -47,7 +146,110 @@ public class Lucene43EdgeNGramTokenizer
* @param maxGram the largest n-gram to generate
*/
public Lucene43EdgeNGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
- super(factory, minGram, maxGram);
+ this(factory, Side.FRONT, minGram, maxGram);
+ }
+
+ private void init(Side side, int minGram, int maxGram) {
+
+ if (side == null) {
+ throw new IllegalArgumentException("sideLabel must be either front or back");
+ }
+
+ if (minGram < 1) {
+ throw new IllegalArgumentException("minGram must be greater than zero");
+ }
+
+ if (minGram > maxGram) {
+ throw new IllegalArgumentException("minGram must not be greater than maxGram");
+ }
+
+ maxGram = Math.min(maxGram, 1024);
+
+ this.minGram = minGram;
+ this.maxGram = maxGram;
+ this.side = side;
}
+ /** Advances to the next token in the stream; returns false at end of stream. */
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+ // if we are just starting, read the whole input
+ if (!started) {
+ started = true;
+ gramSize = minGram;
+ final int limit = side == Side.FRONT ? maxGram : 1024;
+ char[] chars = new char[Math.min(1024, limit)];
+ charsRead = 0;
+ // TODO: refactor to a shared readFully somewhere:
+ boolean exhausted = false;
+ while (charsRead < limit) {
+ final int inc = input.read(chars, charsRead, chars.length-charsRead);
+ if (inc == -1) {
+ exhausted = true;
+ break;
+ }
+ charsRead += inc;
+ if (charsRead == chars.length && charsRead < limit) {
+ chars = ArrayUtil.grow(chars);
+ }
+ }
+
+ inStr = new String(chars, 0, charsRead);
+ inStr = inStr.trim();
+
+ if (!exhausted) {
+ // Read extra throwaway chars so that on end() we
+ // report the correct offset:
+ char[] throwaway = new char[1024];
+ while(true) {
+ final int inc = input.read(throwaway, 0, throwaway.length);
+ if (inc == -1) {
+ break;
+ }
+ charsRead += inc;
+ }
+ }
+
+ inLen = inStr.length();
+ if (inLen == 0) {
+ return false;
+ }
+ posIncrAtt.setPositionIncrement(1);
+ } else {
+ posIncrAtt.setPositionIncrement(0);
+ }
+
+ // if the remaining input is too short, we can't generate any n-grams
+ if (gramSize > inLen) {
+ return false;
+ }
+
+ // if we have hit the end of our n-gram size range, quit
+ if (gramSize > maxGram || gramSize > inLen) {
+ return false;
+ }
+
+ // grab gramSize chars from front or back
+ int start = side == Side.FRONT ? 0 : inLen - gramSize;
+ int end = start + gramSize;
+ termAtt.setEmpty().append(inStr, start, end);
+ offsetAtt.setOffset(correctOffset(start), correctOffset(end));
+ gramSize++;
+ return true;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ // set final offset
+ final int finalOffset = correctOffset(charsRead);
+ this.offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ started = false;
+ }
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java Tue Aug 26 23:17:49 2014
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.ngram
*/
import java.io.IOException;
-import java.io.Reader;
+
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -29,7 +29,7 @@ import org.apache.lucene.util.AttributeF
* Old broken version of {@link NGramTokenizer}.
*/
@Deprecated
-public class Lucene43NGramTokenizer extends Tokenizer {
+public final class Lucene43NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Tue Aug 26 23:17:49 2014
@@ -235,4 +235,8 @@ public class EdgeNGramTokenFilterTest ex
assertFalse(tk.incrementToken());
}
+ public void test43Tokenizer() {
+ new Lucene43EdgeNGramTokenizer(1, 1);
+ }
+
}
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Tue Aug 26 23:17:49 2014
@@ -246,4 +246,9 @@ public class NGramTokenizerTest extends
testNGrams(minGram, maxGram, s, "abcdef");
}
+ public void test43Tokenizer() {
+ // TODO: do more than instantiate (ie check the old broken behavior)
+ new Lucene43NGramTokenizer(1, 1);
+ }
+
}