Posted to commits@lucene.apache.org by rj...@apache.org on 2014/08/27 01:17:50 UTC

svn commit: r1620759 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/ngram/ analysis/common/src/test/org/apache/lucene/analysis/ngram/

Author: rjernst
Date: Tue Aug 26 23:17:49 2014
New Revision: 1620759

URL: http://svn.apache.org/r1620759
Log:
LUCENE-5908: Fix Lucene43NGramTokenizer to be final

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Aug 26 23:17:49 2014
@@ -292,6 +292,8 @@ Bug Fixes
   buffer size was reduced, and scanner buffer growth was disabled, resulting
   in much, much faster tokenization for these text sequences.  
   (Chris Geeringh, Robert Muir, Steve Rowe)
+
+* LUCENE-5908: Fix Lucene43NGramTokenizer to be final. (Ryan Ernst)
   
 Test Framework
 

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43EdgeNGramTokenizer.java Tue Aug 26 23:17:49 2014
@@ -17,18 +17,105 @@ package org.apache.lucene.analysis.ngram
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.Version;
 
 /**
- * Tokenizes the input from an edge into n-grams of given size(s), using pre-4.4 behavior.
- *
- * @deprecated Use {@link org.apache.lucene.analysis.ngram.EdgeNGramTokenizer}.
+ * Old version of {@link EdgeNGramTokenizer} which doesn't correctly handle
+ * supplementary characters.
  */
 @Deprecated
-public class Lucene43EdgeNGramTokenizer extends Lucene43NGramTokenizer {
+public final class Lucene43EdgeNGramTokenizer extends Tokenizer {
+  public static final Side DEFAULT_SIDE = Side.FRONT;
   public static final int DEFAULT_MAX_GRAM_SIZE = 1;
   public static final int DEFAULT_MIN_GRAM_SIZE = 1;
 
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+
+  /** Specifies which side of the input the n-gram should be generated from */
+  public static enum Side {
+
+    /** Get the n-gram from the front of the input */
+    FRONT {
+      @Override
+      public String getLabel() { return "front"; }
+    },
+
+    /** Get the n-gram from the end of the input */
+    BACK  {
+      @Override
+      public String getLabel() { return "back"; }
+    };
+
+    public abstract String getLabel();
+
+    // Get the appropriate Side from a string
+    public static Side getSide(String sideName) {
+      if (FRONT.getLabel().equals(sideName)) {
+        return FRONT;
+      }
+      if (BACK.getLabel().equals(sideName)) {
+        return BACK;
+      }
+      return null;
+    }
+  }
+
+  private int minGram;
+  private int maxGram;
+  private int gramSize;
+  private Side side;
+  private boolean started;
+  private int inLen; // length of the input AFTER trim()
+  private int charsRead; // length of the input
+  private String inStr;
+
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(Side side, int minGram, int maxGram) {
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+   * @param side the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(AttributeFactory factory, Side side, int minGram, int maxGram) {
+    super(factory);
+    init(side, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  public Lucene43EdgeNGramTokenizer(AttributeFactory factory, String sideLabel, int minGram, int maxGram) {
+    this(factory, Side.getSide(sideLabel), minGram, maxGram);
+  }
+
   /**
    * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
    *
@@ -36,7 +123,19 @@ public class Lucene43EdgeNGramTokenizer 
    * @param maxGram the largest n-gram to generate
    */
   public Lucene43EdgeNGramTokenizer(int minGram, int maxGram) {
-    super(minGram, maxGram);
+    this(Side.FRONT, minGram, maxGram);
+  }
+
+  /**
+   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+   *
+   * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+   * @param minGram the smallest n-gram to generate
+   * @param maxGram the largest n-gram to generate
+   */
+  @Deprecated
+  public Lucene43EdgeNGramTokenizer(String sideLabel, int minGram, int maxGram) {
+    this(Side.getSide(sideLabel), minGram, maxGram);
   }
 
   /**
@@ -47,7 +146,110 @@ public class Lucene43EdgeNGramTokenizer 
    * @param maxGram the largest n-gram to generate
    */
   public Lucene43EdgeNGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
-    super(factory, minGram, maxGram);
+    this(factory, Side.FRONT, minGram, maxGram);
+  }
+
+  private void init(Side side, int minGram, int maxGram) {
+
+    if (side == null) {
+      throw new IllegalArgumentException("sideLabel must be either front or back");
+    }
+
+    if (minGram < 1) {
+      throw new IllegalArgumentException("minGram must be greater than zero");
+    }
+
+    if (minGram > maxGram) {
+      throw new IllegalArgumentException("minGram must not be greater than maxGram");
+    }
+
+    maxGram = Math.min(maxGram, 1024);
+
+    this.minGram = minGram;
+    this.maxGram = maxGram;
+    this.side = side;
   }
 
+  /** Returns true if a token was produced, false at end of stream. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    clearAttributes();
+    // if we are just starting, read the whole input
+    if (!started) {
+      started = true;
+      gramSize = minGram;
+      final int limit = side == Side.FRONT ? maxGram : 1024;
+      char[] chars = new char[Math.min(1024, limit)];
+      charsRead = 0;
+      // TODO: refactor to a shared readFully somewhere:
+      boolean exhausted = false;
+      while (charsRead < limit) {
+        final int inc = input.read(chars, charsRead, chars.length-charsRead);
+        if (inc == -1) {
+          exhausted = true;
+          break;
+        }
+        charsRead += inc;
+        if (charsRead == chars.length && charsRead < limit) {
+          chars = ArrayUtil.grow(chars);
+        }
+      }
+
+      inStr = new String(chars, 0, charsRead);
+      inStr = inStr.trim();
+
+      if (!exhausted) {
+        // Read extra throwaway chars so that on end() we
+        // report the correct offset:
+        char[] throwaway = new char[1024];
+        while(true) {
+          final int inc = input.read(throwaway, 0, throwaway.length);
+          if (inc == -1) {
+            break;
+          }
+          charsRead += inc;
+        }
+      }
+
+      inLen = inStr.length();
+      if (inLen == 0) {
+        return false;
+      }
+      posIncrAtt.setPositionIncrement(1);
+    } else {
+      posIncrAtt.setPositionIncrement(0);
+    }
+
+    // if we have hit the end of our n-gram size range, or the remaining
+    // input is too short, we are done
+    if (gramSize > maxGram || gramSize > inLen) {
+      return false;
+    }
+
+    // grab gramSize chars from front or back
+    int start = side == Side.FRONT ? 0 : inLen - gramSize;
+    int end = start + gramSize;
+    termAtt.setEmpty().append(inStr, start, end);
+    offsetAtt.setOffset(correctOffset(start), correctOffset(end));
+    gramSize++;
+    return true;
+  }
+  
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // set final offset
+    final int finalOffset = correctOffset(charsRead);
+    this.offsetAtt.setOffset(finalOffset, finalOffset);
+  }    
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    started = false;
+  }
 }

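For context, a minimal usage sketch of the restored tokenizer (not part of
the commit; it assumes the trunk-era Tokenizer API, where the reader is
supplied via setReader() rather than a constructor argument). With
side=FRONT, minGram=1 and maxGram=3, the input "abcde" yields the front
grams "a", "ab", "abc":

    import java.io.StringReader;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    Lucene43EdgeNGramTokenizer tk = new Lucene43EdgeNGramTokenizer(
        Lucene43EdgeNGramTokenizer.Side.FRONT, 1, 3);
    tk.setReader(new StringReader("abcde"));
    tk.reset();
    CharTermAttribute term = tk.getAttribute(CharTermAttribute.class);
    while (tk.incrementToken()) {
      System.out.println(term.toString()); // "a", then "ab", then "abc"
    }
    tk.end();
    tk.close();
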
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/Lucene43NGramTokenizer.java Tue Aug 26 23:17:49 2014
@@ -18,7 +18,7 @@ package org.apache.lucene.analysis.ngram
  */
 
 import java.io.IOException;
-import java.io.Reader;
+
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -29,7 +29,7 @@ import org.apache.lucene.util.AttributeF
  * Old broken version of {@link NGramTokenizer}.
  */
 @Deprecated
-public class Lucene43NGramTokenizer extends Tokenizer {
+public final class Lucene43NGramTokenizer extends Tokenizer {
   public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
 

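Aside from dropping the now-unused Reader import, the only change to this
file is the final modifier. For illustration (a hypothetical subclass, not
from the commit), code like the following no longer compiles:

    // error: cannot inherit from final Lucene43NGramTokenizer
    class MyNGramTokenizer extends Lucene43NGramTokenizer {
      MyNGramTokenizer() { super(1, 2); }
    }
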
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Tue Aug 26 23:17:49 2014
@@ -235,4 +235,8 @@ public class EdgeNGramTokenFilterTest ex
     assertFalse(tk.incrementToken());
   }
 
+  public void test43Tokenizer() {
+    new Lucene43EdgeNGramTokenizer(1, 1);
+  }
+
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1620759&r1=1620758&r2=1620759&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Tue Aug 26 23:17:49 2014
@@ -246,4 +246,9 @@ public class NGramTokenizerTest extends 
     testNGrams(minGram, maxGram, s, "abcdef");
   }
 
+  public void test43Tokenizer() {
+    // TODO: do more than instantiate (ie check the old broken behavior)
+    new Lucene43NGramTokenizer(1, 1);
+  }
+
 }
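
One way the TODO above might be addressed (a sketch, not part of the commit)
is to assert the old emission order with assertTokenStreamContents, assuming
the test class extends BaseTokenStreamTestCase as the analysis tests do, and
with java.io.StringReader and org.apache.lucene.analysis.Tokenizer imported.
The expected terms assume the pre-4.4 behavior of emitting all grams of one
size before moving on to the next size:

    public void test43TokenizerOutput() throws Exception {
      Tokenizer tk = new Lucene43NGramTokenizer(1, 2);
      tk.setReader(new StringReader("abc"));
      // old order: all 1-grams first, then the 2-grams
      assertTokenStreamContents(tk, new String[] {"a", "b", "c", "ab", "bc"});
    }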