You are viewing a plain-text version of this content; the canonical HTML version is available from the mailing-list archive.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/02/21 15:51:50 UTC
lucene-solr:master: LUCENE-7465: fix corner case in
SimplePattern/SplitTokenizer when lookahead hits end of input
Repository: lucene-solr
Updated Branches:
refs/heads/master ac38872a7 -> 2d03aa21a
LUCENE-7465: fix corner case in SimplePattern/SplitTokenizer when lookahead hits end of input
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2d03aa21
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2d03aa21
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2d03aa21
Branch: refs/heads/master
Commit: 2d03aa21a2b674d36e201f6309e646f37771b73b
Parents: ac38872
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Feb 21 10:51:38 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Feb 21 10:51:38 2017 -0500
----------------------------------------------------------------------
.../analysis/pattern/SimplePatternSplitTokenizer.java | 9 ++++-----
.../lucene/analysis/pattern/SimplePatternTokenizer.java | 2 +-
.../analysis/pattern/TestSimplePatternSplitTokenizer.java | 10 ++++++++++
.../analysis/pattern/TestSimplePatternTokenizer.java | 10 ++++++++++
4 files changed, 25 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
index d2b10c1..a8a40b2 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
@@ -135,13 +135,12 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
} while (state != -1);
if (lastAcceptLength != -1) {
- // strip the trailing separater we just matched from the token:
- tokenUpto -= lastAcceptLength;
- // we found a token separator
+ // we found a token separator; strip the trailing separator we just matched from the token:
int extra = sepUpto - lastAcceptLength;
if (extra != 0) {
pushBack(extra);
}
+ tokenUpto -= lastAcceptLength;
if (tokenUpto > 0) {
fillToken(offsetStart);
return true;
@@ -187,14 +186,14 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {
tokenUpto -= count;
assert tokenUpto >= 0;
if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
+ if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {
if (count > pendingChars.length) {
pendingChars = ArrayUtil.grow(pendingChars, count);
}
- System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
+ System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
pendingLimit = count;
}
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
index 867b10a..ff882ef 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
@@ -172,7 +172,7 @@ public final class SimplePatternTokenizer extends Tokenizer {
private void pushBack(int count) {
if (pendingLimit == 0) {
- if (bufferNextRead >= count) {
+ if (bufferLimit != -1 && bufferNextRead >= count) {
// optimize common case when the chars we are pushing back are still in the buffer
bufferNextRead -= count;
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
index 5642c2b..b497a9a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
@@ -270,4 +270,14 @@ public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
+
+ public void testEndLookahead() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("(ab)+");
+ t.setReader(new StringReader("aba"));
+ assertTokenStreamContents(t,
+ new String[] { "a" },
+ new int[] { 2 },
+ new int[] { 3 },
+ 3);
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2d03aa21/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
index b566713..51e8c43 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
@@ -215,4 +215,14 @@ public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
+
+ public void testEndLookahead() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("(ab)+");
+ t.setReader(new StringReader("aba"));
+ assertTokenStreamContents(t,
+ new String[] { "ab" },
+ new int[] { 0 },
+ new int[] { 2 },
+ 3);
+ }
}