You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2017/01/16 10:27:12 UTC
lucene-solr:branch_6x: LUCENE-7630: Fix (Edge)NGramTokenFilter to no
longer drop payloads and preserve all attributes [merge branch 'edgepayloads'
from Nathan Gass https://github.com/xabbu42/lucene-solr]
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x b5b17b23c -> a69c632aa
LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
[merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
Signed-off-by: Uwe Schindler <us...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a
Branch: refs/heads/branch_6x
Commit: a69c632aa54d064515152145bcbcbe1e869d7061
Parents: b5b17b2
Author: Uwe Schindler <us...@apache.org>
Authored: Mon Jan 16 11:16:43 2017 +0100
Committer: Uwe Schindler <us...@apache.org>
Committed: Mon Jan 16 11:24:55 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 7 +++
.../analysis/ngram/EdgeNGramTokenFilter.java | 17 ++-----
.../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++-----
.../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++
4 files changed, 63 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5de3bab..af0ff77 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -6,6 +6,13 @@ http://s.apache.org/luceneversions
======================= Lucene 6.5.0 =======================
(No Changes)
+======================= Lucene 6.5.0 =======================
+
+Bug Fixes
+
+* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
+ and preserve all attributes. (Nathan Gass via Uwe Schindler)
+
======================= Lucene 6.4.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26f..47b80ff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -22,9 +22,8 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
- private int tokStart;
- private int tokEnd; // only used if the length changed before this filter
private int savePosIncr;
- private int savePosLen;
+ private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
- tokStart = offsetAtt.startOffset();
- tokEnd = offsetAtt.endOffset();
+ state = captureState();
savePosIncr += posIncrAtt.getPositionIncrement();
- savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
- clearAttributes();
- offsetAtt.setOffset(tokStart, tokEnd);
+ restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
@@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
posIncrAtt.setPositionIncrement(0);
}
- posLenAtt.setPositionLength(savePosLen);
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index e275cfa..cb5d447 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
/**
* Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
private int curCodePointCount;
private int curGramSize;
private int curPos;
- private int curPosInc, curPosLen;
- private int tokStart;
- private int tokEnd;
+ private int curPosInc;
+ private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
- private final PositionLengthAttribute posLenAtt;
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
this.maxGram = maxGram;
posIncAtt = addAttribute(PositionIncrementAttribute.class);
- posLenAtt = addAttribute(PositionLengthAttribute.class);
}
/**
@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
- curPosLen = posLenAtt.getPositionLength();
- tokStart = offsetAtt.startOffset();
- tokEnd = offsetAtt.endOffset();
+ state = captureState();
}
}
@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount) {
- clearAttributes();
+ restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
- posLenAtt.setPositionLength(curPosLen);
- offsetAtt.setOffset(tokStart, tokEnd);
curGramSize++;
return true;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index 1243352..5de532f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -22,7 +22,10 @@ import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
/**
* Simple tests to ensure the NGram filter factories are working.
@@ -77,6 +80,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
+ * Test NGramFilterFactory on tokens with payloads
+ */
+ public void testNGramFilterPayload() throws Exception {
+ Reader reader = new StringReader("test|0.1");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+ stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+ assertNotNull(payAttr);
+ BytesRef payData = payAttr.getPayload();
+ assertNotNull(payData);
+ float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+ assertEquals(0.1f, payFloat, 0.0f);
+ }
+ stream.end();
+ stream.close();
+ }
+
+ /**
* Test EdgeNGramTokenizerFactory
*/
public void testEdgeNGramTokenizer() throws Exception {
@@ -123,6 +148,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
+
+ /**
+ * Test EdgeNGramFilterFactory on tokens with payloads
+ */
+ public void testEdgeNGramFilterPayload() throws Exception {
+ Reader reader = new StringReader("test|0.1");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+ stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+ assertNotNull(payAttr);
+ BytesRef payData = payAttr.getPayload();
+ assertNotNull(payData);
+ float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+ assertEquals(0.1f, payFloat, 0.0f);
+ }
+ stream.end();
+ stream.close();
+ }
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
Re: lucene-solr:branch_6x: LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes [merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
Posted by Alan Woodward <al...@flax.co.uk>.
Oh, I see, it’s because TokenFilter extends AttributeSource, so the import is unnecessary. Will push a fix as part of LUCENE-7623.
Alan Woodward
www.flax.co.uk
> On 16 Jan 2017, at 11:26, Alan Woodward <al...@flax.co.uk> wrote:
>
> This is making precommit fail for me locally:
>
> -ecj-javadoc-lint-src:
> [mkdir] Created dir: /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789
> [ecj-lint] Compiling 453 source files to /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789
> [ecj-lint] ----------
> [ecj-lint] 1. ERROR in /Users/woody/asf/lucene-solr-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (at line 26)
> [ecj-lint] import org.apache.lucene.util.AttributeSource.State;
> [ecj-lint] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> [ecj-lint] The import org.apache.lucene.util.AttributeSource.State is never used
> [ecj-lint] ----------
>
> Which is confusing as hell, because the import clearly *is* used. And removing the import fixes things, even though it shouldn’t then compile.
>
> Alan Woodward
> www.flax.co.uk <http://www.flax.co.uk/>
>
>
>> On 16 Jan 2017, at 10:27, uschindler@apache.org <ma...@apache.org> wrote:
>>
>> Repository: lucene-solr
>> Updated Branches:
>> refs/heads/branch_6x b5b17b23c -> a69c632aa
>>
>>
>> LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
>> [merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
>>
>> Signed-off-by: Uwe Schindler <uschindler@apache.org <ma...@apache.org>>
>>
>>
>> Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo <http://git-wip-us.apache.org/repos/asf/lucene-solr/repo>
>> Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a <http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a>
>> Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a <http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a>
>> Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a <http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a>
>>
>> Branch: refs/heads/branch_6x
>> Commit: a69c632aa54d064515152145bcbcbe1e869d7061
>> Parents: b5b17b2
>> Author: Uwe Schindler <uschindler@apache.org <ma...@apache.org>>
>> Authored: Mon Jan 16 11:16:43 2017 +0100
>> Committer: Uwe Schindler <uschindler@apache.org <ma...@apache.org>>
>> Committed: Mon Jan 16 11:24:55 2017 +0100
>>
>> ----------------------------------------------------------------------
>> lucene/CHANGES.txt | 7 +++
>> .../analysis/ngram/EdgeNGramTokenFilter.java | 17 ++-----
>> .../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++-----
>> .../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++
>> 4 files changed, 63 insertions(+), 27 deletions(-)
>> ----------------------------------------------------------------------
>>
>>
>> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt <http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt>
>> ----------------------------------------------------------------------
>> diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
>> index 5de3bab..af0ff77 100644
>> --- a/lucene/CHANGES.txt
>> +++ b/lucene/CHANGES.txt
>> @@ -6,6 +6,13 @@ http://s.apache.org/luceneversions
>> ======================= Lucene 6.5.0 =======================
>> (No Changes)
>>
>> +======================= Lucene 6.5.0 =======================
>> +
>> +Bug Fixes
>> +
>> +* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
>> + and preserve all attributes. (Nathan Gass via Uwe Schindler)
>> +
>> ======================= Lucene 6.4.0 =======================
>>
>> API Changes
>>
>> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
>> ----------------------------------------------------------------------
>> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
>> index 827e26f..47b80ff 100644
>> --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
>> +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
>> @@ -22,9 +22,8 @@ import java.io.IOException;
>> import org.apache.lucene.analysis.TokenFilter;
>> import org.apache.lucene.analysis.TokenStream;
>> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
>> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
>> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
>> +import org.apache.lucene.util.AttributeSource.State;
>>
>> /**
>> * Tokenizes the given token into n-grams of given size(s).
>> @@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
>> private int curTermLength;
>> private int curCodePointCount;
>> private int curGramSize;
>> - private int tokStart;
>> - private int tokEnd; // only used if the length changed before this filter
>> private int savePosIncr;
>> - private int savePosLen;
>> + private State state;
>>
>> private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
>> - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
>> private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
>> - private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
>>
>> /**
>> * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
>> @@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
>> curTermLength = termAtt.length();
>> curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
>> curGramSize = minGram;
>> - tokStart = offsetAtt.startOffset();
>> - tokEnd = offsetAtt.endOffset();
>> + state = captureState();
>> savePosIncr += posIncrAtt.getPositionIncrement();
>> - savePosLen = posLenAtt.getPositionLength();
>> }
>> }
>> if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
>> if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
>> // grab gramSize chars from front or back
>> - clearAttributes();
>> - offsetAtt.setOffset(tokStart, tokEnd);
>> + restoreState(state);
>> // first ngram gets increment, others don't
>> if (curGramSize == minGram) {
>> posIncrAtt.setPositionIncrement(savePosIncr);
>> @@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
>> } else {
>> posIncrAtt.setPositionIncrement(0);
>> }
>> - posLenAtt.setPositionLength(savePosLen);
>> final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
>> termAtt.copyBuffer(curTermBuffer, 0, charLength);
>> curGramSize++;
>>
>> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
>> ----------------------------------------------------------------------
>> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
>> index e275cfa..cb5d447 100644
>> --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
>> +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
>> @@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
>> import org.apache.lucene.analysis.TokenStream;
>> import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
>> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
>> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
>> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
>> +import org.apache.lucene.util.AttributeSource.State;
>>
>> /**
>> * Tokenizes the input into n-grams of the given size(s).
>> @@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
>> private int curCodePointCount;
>> private int curGramSize;
>> private int curPos;
>> - private int curPosInc, curPosLen;
>> - private int tokStart;
>> - private int tokEnd;
>> + private int curPosInc;
>> + private State state;
>>
>> private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
>> private final PositionIncrementAttribute posIncAtt;
>> - private final PositionLengthAttribute posLenAtt;
>> - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
>>
>> /**
>> * Creates NGramTokenFilter with given min and max n-grams.
>> @@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
>> this.maxGram = maxGram;
>>
>> posIncAtt = addAttribute(PositionIncrementAttribute.class);
>> - posLenAtt = addAttribute(PositionLengthAttribute.class);
>> }
>>
>> /**
>> @@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
>> curGramSize = minGram;
>> curPos = 0;
>> curPosInc = posIncAtt.getPositionIncrement();
>> - curPosLen = posLenAtt.getPositionLength();
>> - tokStart = offsetAtt.startOffset();
>> - tokEnd = offsetAtt.endOffset();
>> + state = captureState();
>> }
>> }
>>
>> @@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
>> curGramSize = minGram;
>> }
>> if ((curPos + curGramSize) <= curCodePointCount) {
>> - clearAttributes();
>> + restoreState(state);
>> final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
>> final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
>> termAtt.copyBuffer(curTermBuffer, start, end - start);
>> posIncAtt.setPositionIncrement(curPosInc);
>> curPosInc = 0;
>> - posLenAtt.setPositionLength(curPosLen);
>> - offsetAtt.setOffset(tokStart, tokEnd);
>> curGramSize++;
>> return true;
>> }
>>
>> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
>> ----------------------------------------------------------------------
>> diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
>> index 1243352..5de532f 100644
>> --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
>> +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
>> @@ -22,7 +22,10 @@ import java.io.StringReader;
>>
>> import org.apache.lucene.analysis.TokenStream;
>> import org.apache.lucene.analysis.Tokenizer;
>> +import org.apache.lucene.analysis.payloads.PayloadHelper;
>> +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
>> import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
>> +import org.apache.lucene.util.BytesRef;
>>
>> /**
>> * Simple tests to ensure the NGram filter factories are working.
>> @@ -77,6 +80,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
>> }
>>
>> /**
>> + * Test NGramFilterFactory on tokens with payloads
>> + */
>> + public void testNGramFilterPayload() throws Exception {
>> + Reader reader = new StringReader("test|0.1");
>> + TokenStream stream = whitespaceMockTokenizer(reader);
>> + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
>> + stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
>> +
>> + stream.reset();
>> + while (stream.incrementToken()) {
>> + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
>> + assertNotNull(payAttr);
>> + BytesRef payData = payAttr.getPayload();
>> + assertNotNull(payData);
>> + float payFloat = PayloadHelper.decodeFloat(payData.bytes);
>> + assertEquals(0.1f, payFloat, 0.0f);
>> + }
>> + stream.end();
>> + stream.close();
>> + }
>> +
>> + /**
>> * Test EdgeNGramTokenizerFactory
>> */
>> public void testEdgeNGramTokenizer() throws Exception {
>> @@ -123,6 +148,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
>> assertTokenStreamContents(stream,
>> new String[] { "t", "te" });
>> }
>> +
>> + /**
>> + * Test EdgeNGramFilterFactory on tokens with payloads
>> + */
>> + public void testEdgeNGramFilterPayload() throws Exception {
>> + Reader reader = new StringReader("test|0.1");
>> + TokenStream stream = whitespaceMockTokenizer(reader);
>> + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
>> + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
>> +
>> + stream.reset();
>> + while (stream.incrementToken()) {
>> + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
>> + assertNotNull(payAttr);
>> + BytesRef payData = payAttr.getPayload();
>> + assertNotNull(payData);
>> + float payFloat = PayloadHelper.decodeFloat(payData.bytes);
>> + assertEquals(0.1f, payFloat, 0.0f);
>> + }
>> + stream.end();
>> + stream.close();
>> + }
>>
>> /** Test that bogus arguments result in exception */
>> public void testBogusArguments() throws Exception {
>>
>
Re: lucene-solr:branch_6x: LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes [merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
Posted by Alan Woodward <al...@flax.co.uk>.
This is making precommit fail for me locally:
-ecj-javadoc-lint-src:
[mkdir] Created dir: /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789
[ecj-lint] Compiling 453 source files to /var/folders/16/hgq2wtys7nv1_x9st6mdpwzh0000gp/T/ecj662445789
[ecj-lint] ----------
[ecj-lint] 1. ERROR in /Users/woody/asf/lucene-solr-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (at line 26)
[ecj-lint] import org.apache.lucene.util.AttributeSource.State;
[ecj-lint] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[ecj-lint] The import org.apache.lucene.util.AttributeSource.State is never used
[ecj-lint] ----------
Which is confusing as hell, because the import clearly *is* used. And removing the import fixes things, even though it shouldn’t then compile.
Alan Woodward
www.flax.co.uk
> On 16 Jan 2017, at 10:27, uschindler@apache.org wrote:
>
> Repository: lucene-solr
> Updated Branches:
> refs/heads/branch_6x b5b17b23c -> a69c632aa
>
>
> LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
> [merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
>
> Signed-off-by: Uwe Schindler <us...@apache.org>
>
>
> Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
> Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/a69c632a
> Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/a69c632a
> Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/a69c632a
>
> Branch: refs/heads/branch_6x
> Commit: a69c632aa54d064515152145bcbcbe1e869d7061
> Parents: b5b17b2
> Author: Uwe Schindler <us...@apache.org>
> Authored: Mon Jan 16 11:16:43 2017 +0100
> Committer: Uwe Schindler <us...@apache.org>
> Committed: Mon Jan 16 11:24:55 2017 +0100
>
> ----------------------------------------------------------------------
> lucene/CHANGES.txt | 7 +++
> .../analysis/ngram/EdgeNGramTokenFilter.java | 17 ++-----
> .../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++-----
> .../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++
> 4 files changed, 63 insertions(+), 27 deletions(-)
> ----------------------------------------------------------------------
>
>
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/CHANGES.txt
> ----------------------------------------------------------------------
> diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
> index 5de3bab..af0ff77 100644
> --- a/lucene/CHANGES.txt
> +++ b/lucene/CHANGES.txt
> @@ -6,6 +6,13 @@ http://s.apache.org/luceneversions
> ======================= Lucene 6.5.0 =======================
> (No Changes)
>
> +======================= Lucene 6.5.0 =======================
> +
> +Bug Fixes
> +
> +* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
> + and preserve all attributes. (Nathan Gass via Uwe Schindler)
> +
> ======================= Lucene 6.4.0 =======================
>
> API Changes
>
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
> ----------------------------------------------------------------------
> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
> index 827e26f..47b80ff 100644
> --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
> +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
> @@ -22,9 +22,8 @@ import java.io.IOException;
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
> +import org.apache.lucene.util.AttributeSource.State;
>
> /**
> * Tokenizes the given token into n-grams of given size(s).
> @@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
> private int curTermLength;
> private int curCodePointCount;
> private int curGramSize;
> - private int tokStart;
> - private int tokEnd; // only used if the length changed before this filter
> private int savePosIncr;
> - private int savePosLen;
> + private State state;
>
> private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
> - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
> private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
> - private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
>
> /**
> * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
> @@ -86,17 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
> curTermLength = termAtt.length();
> curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
> curGramSize = minGram;
> - tokStart = offsetAtt.startOffset();
> - tokEnd = offsetAtt.endOffset();
> + state = captureState();
> savePosIncr += posIncrAtt.getPositionIncrement();
> - savePosLen = posLenAtt.getPositionLength();
> }
> }
> if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
> if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
> // grab gramSize chars from front or back
> - clearAttributes();
> - offsetAtt.setOffset(tokStart, tokEnd);
> + restoreState(state);
> // first ngram gets increment, others don't
> if (curGramSize == minGram) {
> posIncrAtt.setPositionIncrement(savePosIncr);
> @@ -104,7 +96,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
> } else {
> posIncrAtt.setPositionIncrement(0);
> }
> - posLenAtt.setPositionLength(savePosLen);
> final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
> termAtt.copyBuffer(curTermBuffer, 0, charLength);
> curGramSize++;
>
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
> ----------------------------------------------------------------------
> diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
> index e275cfa..cb5d447 100644
> --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
> +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
> @@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
> import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
> -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
> +import org.apache.lucene.util.AttributeSource.State;
>
> /**
> * Tokenizes the input into n-grams of the given size(s).
> @@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
> private int curCodePointCount;
> private int curGramSize;
> private int curPos;
> - private int curPosInc, curPosLen;
> - private int tokStart;
> - private int tokEnd;
> + private int curPosInc;
> + private State state;
>
> private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
> private final PositionIncrementAttribute posIncAtt;
> - private final PositionLengthAttribute posLenAtt;
> - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
>
> /**
> * Creates NGramTokenFilter with given min and max n-grams.
> @@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
> this.maxGram = maxGram;
>
> posIncAtt = addAttribute(PositionIncrementAttribute.class);
> - posLenAtt = addAttribute(PositionLengthAttribute.class);
> }
>
> /**
> @@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
> curGramSize = minGram;
> curPos = 0;
> curPosInc = posIncAtt.getPositionIncrement();
> - curPosLen = posLenAtt.getPositionLength();
> - tokStart = offsetAtt.startOffset();
> - tokEnd = offsetAtt.endOffset();
> + state = captureState();
> }
> }
>
> @@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
> curGramSize = minGram;
> }
> if ((curPos + curGramSize) <= curCodePointCount) {
> - clearAttributes();
> + restoreState(state);
> final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
> final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
> termAtt.copyBuffer(curTermBuffer, start, end - start);
> posIncAtt.setPositionIncrement(curPosInc);
> curPosInc = 0;
> - posLenAtt.setPositionLength(curPosLen);
> - offsetAtt.setOffset(tokStart, tokEnd);
> curGramSize++;
> return true;
> }
>
> http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/a69c632a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
> ----------------------------------------------------------------------
> diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
> index 1243352..5de532f 100644
> --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
> +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
> @@ -22,7 +22,10 @@ import java.io.StringReader;
>
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> +import org.apache.lucene.analysis.payloads.PayloadHelper;
> +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
> import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
> +import org.apache.lucene.util.BytesRef;
>
> /**
> * Simple tests to ensure the NGram filter factories are working.
> @@ -77,6 +80,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
> }
>
> /**
> + * Test NGramFilterFactory on tokens with payloads
> + */
> + public void testNGramFilterPayload() throws Exception {
> + Reader reader = new StringReader("test|0.1");
> + TokenStream stream = whitespaceMockTokenizer(reader);
> + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
> + stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
> +
> + stream.reset();
> + while (stream.incrementToken()) {
> + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
> + assertNotNull(payAttr);
> + BytesRef payData = payAttr.getPayload();
> + assertNotNull(payData);
> + float payFloat = PayloadHelper.decodeFloat(payData.bytes);
> + assertEquals(0.1f, payFloat, 0.0f);
> + }
> + stream.end();
> + stream.close();
> + }
> +
> + /**
> * Test EdgeNGramTokenizerFactory
> */
> public void testEdgeNGramTokenizer() throws Exception {
> @@ -123,6 +148,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
> assertTokenStreamContents(stream,
> new String[] { "t", "te" });
> }
> +
> + /**
> + * Test EdgeNGramFilterFactory on tokens with payloads
> + */
> + public void testEdgeNGramFilterPayload() throws Exception {
> + Reader reader = new StringReader("test|0.1");
> + TokenStream stream = whitespaceMockTokenizer(reader);
> + stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
> + stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
> +
> + stream.reset();
> + while (stream.incrementToken()) {
> + PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
> + assertNotNull(payAttr);
> + BytesRef payData = payAttr.getPayload();
> + assertNotNull(payData);
> + float payFloat = PayloadHelper.decodeFloat(payData.bytes);
> + assertEquals(0.1f, payFloat, 0.0f);
> + }
> + stream.end();
> + stream.close();
> + }
>
> /** Test that bogus arguments result in exception */
> public void testBogusArguments() throws Exception {
>