You are viewing a plain-text version of this content; the canonical HTML version is available from the mailing-list archive.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/02/21 15:51:50 UTC
lucene-solr:master: LUCENE-7465: fix corner case in
SimplePattern/SplitTokenizer when lookahead hits end of input
Repository: lucene-solr
Updated Branches:
refs/heads/master ac38872a7 -> 2d03aa21a
LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2d03aa21
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2d03aa21
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2d03aa21
Branch: refs/heads/master
Commit: 2d03aa21a2b674d36e201f6309e646f37771b73b
Parents: ac38872
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Feb 21 10:51:38 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Feb 21 10:51:38 2017 -0500
----------------------------------------------------------------------
.../analysis/pattern/SimplePatternSplitTokenizer.java | 9 ++++-----
.../lucene/analysis/pattern/SimplePatternTokenizer.java | 2 +-
.../analysis/pattern/TestSimplePatternSplitTokenizer.java | 10 ++++++++++
.../analysis/pattern/TestSimplePatternTokenizer.java | 10 ++++++++++
4 files changed, 25 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
index d2b10c1..a8a40b2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
@@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
} while (state != -1);
if (lastAcceptLength != -1) {
- // strip the trailing separater we just matched from the token:
- tokenUpto -= lastAcceptLength;
- // we found a token separator
+ // we found a token separator; strip the trailing separator we just matched from the token:
int extra = sepUpto - lastAcceptLength;
if (extra != 0) {
pushBack(extra);
}
+ tokenUpto -= lastAcceptLength;
if (tokenUpto > 0) {
fillToken(offsetStart);
return true;
@@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
tokenUpto -= count;
assert tokenUpto >= 0;
if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
+ if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {
if (count > pendingChars.length) {
pendingChars = ArrayUtil.grow(pendingChars, count);
}
- System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
+ System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
pendingLimit = count;
}
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
index 867b10a..ff882ef 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
@@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer {
private void pushBack(int count) {
if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
+ if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
index 5642c2b..b497a9a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
@@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
+
+ public void testEndLookahead() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("(ab)+");
+ t.setReader(new StringReader("aba"));
+ assertTokenStreamContents(t,
+ new String[] { "a" },
+ new int[] { 2 },
+ new int[] { 3 },
+ 3);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
index b566713..51e8c43 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
@@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
+
+ public void testEndLookahead() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("(ab)+");
+ t.setReader(new StringReader("aba"));
+ assertTokenStreamContents(t,
+ new String[] { "ab" },
+ new int[] { 0 },
+ new int[] { 2 },
+ 3);
+ }
}