You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2017/01/16 10:17:18 UTC
[1/6] lucene-solr:master: add test that EdgeNGram filter keeps
payloads
Repository: lucene-solr
Updated Branches:
refs/heads/master 9aa78dcca -> c64a01158
add test that EdgeNGram filter keeps payloads
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/61e45283
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/61e45283
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/61e45283
Branch: refs/heads/master
Commit: 61e45283061ae486acc5882c5a770025c8291222
Parents: 987e265
Author: Nathan Gass <ga...@search.ch>
Authored: Mon Jan 9 14:59:31 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 12:14:26 2017 +0100
----------------------------------------------------------------------
.../lucene/analysis/ngram/TestNGramFilters.java | 22 ++++++++++++++++++++
1 file changed, 22 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/61e45283/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index 1243352..b6f4405 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -22,7 +22,10 @@ import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
/**
* Simple tests to ensure the NGram filter factories are working.
@@ -123,6 +126,25 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
+
+ public void testEdgeNGramFilterPayload() throws Exception {
+ Reader reader = new StringReader("test|0.1");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+ stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+ assertNotNull(payAttr);
+ BytesRef payData = payAttr.getPayload();
+ assertNotNull(payData);
+ float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+ assertEquals(0.1f, payFloat, 0.0f);
+ }
+ stream.end();
+ stream.close();
+ }
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
[3/6] lucene-solr:master: use captureState and restoreState instead
of cloneAttributes
Posted by us...@apache.org.
use captureState and restoreState instead of cloneAttributes
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/01f2a87c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/01f2a87c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/01f2a87c
Branch: refs/heads/master
Commit: 01f2a87c67392a86b533d0c76ba7666845d1945f
Parents: 6570e6e
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 15:54:07 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 15:54:07 2017 +0100
----------------------------------------------------------------------
.../apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01f2a87c/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 303b7e320..df12fda 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.State;
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -43,7 +43,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int curCodePointCount;
private int curGramSize;
private int savePosIncr;
- private AttributeSource attributes;
+ private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -81,15 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
- attributes = input.cloneAttributes();
+ state = captureState();
savePosIncr += posIncrAtt.getPositionIncrement();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
- clearAttributes();
- attributes.copyTo(this);
+ restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
[2/6] lucene-solr:master: copy all attributes including payload to
new tokens
Posted by us...@apache.org.
copy all attributes including payload to new tokens
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6570e6ec
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6570e6ec
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6570e6ec
Branch: refs/heads/master
Commit: 6570e6ecc2b14a28da9873948083791ba47145d0
Parents: 61e4528
Author: Nathan Gass <ga...@search.ch>
Authored: Mon Jan 9 15:00:21 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 12:14:27 2017 +0100
----------------------------------------------------------------------
.../lucene/analysis/ngram/EdgeNGramTokenFilter.java | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6570e6ec/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26f..303b7e320 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -22,9 +22,8 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
- private int tokStart;
- private int tokEnd; // only used if the length changed before this filter
private int savePosIncr;
- private int savePosLen;
+ private AttributeSource attributes;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
/**
* Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -86,17 +81,15 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
curTermLength = termAtt.length();
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
- tokStart = offsetAtt.startOffset();
- tokEnd = offsetAtt.endOffset();
+ attributes = input.cloneAttributes();
savePosIncr += posIncrAtt.getPositionIncrement();
- savePosLen = posLenAtt.getPositionLength();
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
- offsetAtt.setOffset(tokStart, tokEnd);
+ attributes.copyTo(this);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
@@ -104,7 +97,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
posIncrAtt.setPositionIncrement(0);
}
- posLenAtt.setPositionLength(savePosLen);
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
[6/6] lucene-solr:master: LUCENE-7630: Fix (Edge)NGramTokenFilter to
no longer drop payloads and preserve all attributes [merge branch
'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
Posted by us...@apache.org.
LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
[merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]
Signed-off-by: Uwe Schindler <us...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c64a0115
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c64a0115
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c64a0115
Branch: refs/heads/master
Commit: c64a01158e972176256e257d6c1d4629b05783a2
Parents: 9aa78dc ea049b9
Author: Uwe Schindler <us...@apache.org>
Authored: Mon Jan 16 11:16:43 2017 +0100
Committer: Uwe Schindler <us...@apache.org>
Committed: Mon Jan 16 11:16:43 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 7 +++
.../analysis/ngram/EdgeNGramTokenFilter.java | 17 ++-----
.../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++-----
.../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++
4 files changed, 63 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c64a0115/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --cc lucene/CHANGES.txt
index 58201d6,4912920..530b0d4
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@@ -61,6 -58,6 +61,13 @@@ Othe
* LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
++======================= Lucene 6.5.0 =======================
++
++Bug Fixes
++
++* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
++ and preserve all attributes. (Nathan Gass via Uwe Schindler)
++
======================= Lucene 6.4.0 =======================
API Changes
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c64a0115/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --cc lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26f,df12fda..47b80ff
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@@ -95,8 -88,7 +88,7 @@@ public final class EdgeNGramTokenFilte
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
- clearAttributes();
- offsetAtt.setOffset(tokStart, tokEnd);
- restoreState(state);
++ restoreState(state);
// first ngram gets increment, others don't
if (curGramSize == minGram) {
posIncrAtt.setPositionIncrement(savePosIncr);
[4/6] lucene-solr:master: add comment and test for ngram token filter
Posted by us...@apache.org.
add comment and test for ngram token filter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/80e28542
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/80e28542
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/80e28542
Branch: refs/heads/master
Commit: 80e2854247cce485920b45acdeffa3e68bcea385
Parents: 01f2a87
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 16:42:41 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 16:42:41 2017 +0100
----------------------------------------------------------------------
.../lucene/analysis/ngram/TestNGramFilters.java | 25 ++++++++++++++++++++
1 file changed, 25 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/80e28542/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index b6f4405..5de532f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -80,6 +80,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
}
/**
+ * Test NGramFilterFactory on tokens with payloads
+ */
+ public void testNGramFilterPayload() throws Exception {
+ Reader reader = new StringReader("test|0.1");
+ TokenStream stream = whitespaceMockTokenizer(reader);
+ stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+ stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+ assertNotNull(payAttr);
+ BytesRef payData = payAttr.getPayload();
+ assertNotNull(payData);
+ float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+ assertEquals(0.1f, payFloat, 0.0f);
+ }
+ stream.end();
+ stream.close();
+ }
+
+ /**
* Test EdgeNGramTokenizerFactory
*/
public void testEdgeNGramTokenizer() throws Exception {
@@ -127,6 +149,9 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
new String[] { "t", "te" });
}
+ /**
+ * Test EdgeNGramFilterFactory on tokens with payloads
+ */
public void testEdgeNGramFilterPayload() throws Exception {
Reader reader = new StringReader("test|0.1");
TokenStream stream = whitespaceMockTokenizer(reader);
[5/6] lucene-solr:master: also copy all attributes for ngram token
filters
Posted by us...@apache.org.
also copy all attributes for ngram token filters
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ea049b96
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ea049b96
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ea049b96
Branch: refs/heads/master
Commit: ea049b96a24d6afc582ecdf406e8bf256b9911d9
Parents: 80e2854
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 17:01:34 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 17:07:23 2017 +0100
----------------------------------------------------------------------
.../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++++--------------
1 file changed, 5 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea049b96/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index e275cfa..cb5d447 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
/**
* Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
private int curCodePointCount;
private int curGramSize;
private int curPos;
- private int curPosInc, curPosLen;
- private int tokStart;
- private int tokEnd;
+ private int curPosInc;
+ private State state;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
- private final PositionLengthAttribute posLenAtt;
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
this.maxGram = maxGram;
posIncAtt = addAttribute(PositionIncrementAttribute.class);
- posLenAtt = addAttribute(PositionLengthAttribute.class);
}
/**
@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
- curPosLen = posLenAtt.getPositionLength();
- tokStart = offsetAtt.startOffset();
- tokEnd = offsetAtt.endOffset();
+ state = captureState();
}
}
@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
curGramSize = minGram;
}
if ((curPos + curGramSize) <= curCodePointCount) {
- clearAttributes();
+ restoreState(state);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
- posLenAtt.setPositionLength(curPosLen);
- offsetAtt.setOffset(tokStart, tokEnd);
curGramSize++;
return true;
}