You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2017/01/16 10:17:18 UTC

[1/6] lucene-solr:master: add test that EdgeNGram filter keeps payloads

Repository: lucene-solr
Updated Branches:
  refs/heads/master 9aa78dcca -> c64a01158


add test that EdgeNGram filter keeps payloads


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/61e45283
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/61e45283
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/61e45283

Branch: refs/heads/master
Commit: 61e45283061ae486acc5882c5a770025c8291222
Parents: 987e265
Author: Nathan Gass <ga...@search.ch>
Authored: Mon Jan 9 14:59:31 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 12:14:26 2017 +0100

----------------------------------------------------------------------
 .../lucene/analysis/ngram/TestNGramFilters.java | 22 ++++++++++++++++++++
 1 file changed, 22 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/61e45283/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index 1243352..b6f4405 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -22,7 +22,10 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * Simple tests to ensure the NGram filter factories are working.
@@ -123,6 +126,25 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
     assertTokenStreamContents(stream, 
         new String[] { "t", "te" });
   }
+
+  public void testEdgeNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
   
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {


[3/6] lucene-solr:master: use captureState and restoreState instead of cloneAttributes

Posted by us...@apache.org.
use captureState and restoreState instead of cloneAttributes


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/01f2a87c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/01f2a87c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/01f2a87c

Branch: refs/heads/master
Commit: 01f2a87c67392a86b533d0c76ba7666845d1945f
Parents: 6570e6e
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 15:54:07 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 15:54:07 2017 +0100

----------------------------------------------------------------------
 .../apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java  | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01f2a87c/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 303b7e320..df12fda 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).
@@ -43,7 +43,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curCodePointCount;
   private int curGramSize;
   private int savePosIncr;
-  private AttributeSource attributes;
+  private State state;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -81,15 +81,14 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curTermLength = termAtt.length();
           curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
           curGramSize = minGram;
-          attributes = input.cloneAttributes();
+          state = captureState();
           savePosIncr += posIncrAtt.getPositionIncrement();
         }
       }
       if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
         if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
           // grab gramSize chars from front or back
-          clearAttributes();
-          attributes.copyTo(this);
+	      restoreState(state);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
             posIncrAtt.setPositionIncrement(savePosIncr);


[2/6] lucene-solr:master: copy all attributes including payload to new tokens

Posted by us...@apache.org.
copy all attributes including payload to new tokens


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6570e6ec
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6570e6ec
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6570e6ec

Branch: refs/heads/master
Commit: 6570e6ecc2b14a28da9873948083791ba47145d0
Parents: 61e4528
Author: Nathan Gass <ga...@search.ch>
Authored: Mon Jan 9 15:00:21 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 12:14:27 2017 +0100

----------------------------------------------------------------------
 .../lucene/analysis/ngram/EdgeNGramTokenFilter.java | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6570e6ec/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26f..303b7e320 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -22,9 +22,8 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).
@@ -43,15 +42,11 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curTermLength;
   private int curCodePointCount;
   private int curGramSize;
-  private int tokStart;
-  private int tokEnd; // only used if the length changed before this filter
   private int savePosIncr;
-  private int savePosLen;
+  private AttributeSource attributes;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -86,17 +81,15 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           curTermLength = termAtt.length();
           curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
           curGramSize = minGram;
-          tokStart = offsetAtt.startOffset();
-          tokEnd = offsetAtt.endOffset();
+          attributes = input.cloneAttributes();
           savePosIncr += posIncrAtt.getPositionIncrement();
-          savePosLen = posLenAtt.getPositionLength();
         }
       }
       if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
         if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
           // grab gramSize chars from front or back
           clearAttributes();
-          offsetAtt.setOffset(tokStart, tokEnd);
+          attributes.copyTo(this);
           // first ngram gets increment, others don't
           if (curGramSize == minGram) {
             posIncrAtt.setPositionIncrement(savePosIncr);
@@ -104,7 +97,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
           } else {
             posIncrAtt.setPositionIncrement(0);
           }
-          posLenAtt.setPositionLength(savePosLen);
           final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
           termAtt.copyBuffer(curTermBuffer, 0, charLength);
           curGramSize++;


[6/6] lucene-solr:master: LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes [merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]

Posted by us...@apache.org.
LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads and preserve all attributes
[merge branch 'edgepayloads' from Nathan Gass https://github.com/xabbu42/lucene-solr]

Signed-off-by: Uwe Schindler <us...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c64a0115
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c64a0115
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c64a0115

Branch: refs/heads/master
Commit: c64a01158e972176256e257d6c1d4629b05783a2
Parents: 9aa78dc ea049b9
Author: Uwe Schindler <us...@apache.org>
Authored: Mon Jan 16 11:16:43 2017 +0100
Committer: Uwe Schindler <us...@apache.org>
Committed: Mon Jan 16 11:16:43 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  7 +++
 .../analysis/ngram/EdgeNGramTokenFilter.java    | 17 ++-----
 .../lucene/analysis/ngram/NGramTokenFilter.java | 19 +++-----
 .../lucene/analysis/ngram/TestNGramFilters.java | 47 ++++++++++++++++++++
 4 files changed, 63 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c64a0115/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --cc lucene/CHANGES.txt
index 58201d6,4912920..530b0d4
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@@ -61,6 -58,6 +61,13 @@@ Othe
  
  * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
  
++======================= Lucene 6.5.0 =======================
++
++Bug Fixes
++
++* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
++  and preserve all attributes. (Nathan Gass via Uwe Schindler)
++
  ======================= Lucene 6.4.0 =======================
  
  API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c64a0115/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --cc lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 827e26f,df12fda..47b80ff
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@@ -95,8 -88,7 +88,7 @@@ public final class EdgeNGramTokenFilte
        if (curGramSize <= maxGram) {         // if we have hit the end of our n-gram size range, quit
          if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
            // grab gramSize chars from front or back
-           clearAttributes();
-           offsetAtt.setOffset(tokStart, tokEnd);
 -	      restoreState(state);
++          restoreState(state);
            // first ngram gets increment, others don't
            if (curGramSize == minGram) {
              posIncrAtt.setPositionIncrement(savePosIncr);


[4/6] lucene-solr:master: add comment and test for ngram token filter

Posted by us...@apache.org.
add comment and test for ngram token filter


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/80e28542
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/80e28542
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/80e28542

Branch: refs/heads/master
Commit: 80e2854247cce485920b45acdeffa3e68bcea385
Parents: 01f2a87
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 16:42:41 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 16:42:41 2017 +0100

----------------------------------------------------------------------
 .../lucene/analysis/ngram/TestNGramFilters.java | 25 ++++++++++++++++++++
 1 file changed, 25 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/80e28542/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
index b6f4405..5de532f 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java
@@ -80,6 +80,28 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
   }
 
   /**
+   * Test NGramFilterFactory on tokens with payloads
+   */
+  public void testNGramFilterPayload() throws Exception {
+    Reader reader = new StringReader("test|0.1");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
+    stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
+
+    stream.reset();
+    while (stream.incrementToken()) {
+      PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
+      assertNotNull(payAttr);
+      BytesRef payData = payAttr.getPayload();
+      assertNotNull(payData);
+      float payFloat = PayloadHelper.decodeFloat(payData.bytes);
+      assertEquals(0.1f, payFloat, 0.0f);
+    }
+    stream.end();
+    stream.close();
+  }
+
+  /**
    * Test EdgeNGramTokenizerFactory
    */
   public void testEdgeNGramTokenizer() throws Exception {
@@ -127,6 +149,9 @@ public class TestNGramFilters extends BaseTokenStreamFactoryTestCase {
         new String[] { "t", "te" });
   }
 
+  /**
+   * Test EdgeNGramFilterFactory on tokens with payloads
+   */
   public void testEdgeNGramFilterPayload() throws Exception {
     Reader reader = new StringReader("test|0.1");
     TokenStream stream = whitespaceMockTokenizer(reader);


[5/6] lucene-solr:master: also copy all attributes for ngram token filters

Posted by us...@apache.org.
also copy all attributes for ngram token filters


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ea049b96
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ea049b96
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ea049b96

Branch: refs/heads/master
Commit: ea049b96a24d6afc582ecdf406e8bf256b9911d9
Parents: 80e2854
Author: Nathan Gass <ga...@search.ch>
Authored: Fri Jan 13 17:01:34 2017 +0100
Committer: Nathan Gass <ga...@search.ch>
Committed: Fri Jan 13 17:07:23 2017 +0100

----------------------------------------------------------------------
 .../lucene/analysis/ngram/NGramTokenFilter.java  | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea049b96/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index e275cfa..cb5d447 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -23,9 +23,8 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.State;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).
@@ -52,14 +51,11 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curCodePointCount;
   private int curGramSize;
   private int curPos;
-  private int curPosInc, curPosLen;
-  private int tokStart;
-  private int tokEnd;
+  private int curPosInc;
+  private State state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncAtt;
-  private final PositionLengthAttribute posLenAtt;
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.
@@ -79,7 +75,6 @@ public final class NGramTokenFilter extends TokenFilter {
     this.maxGram = maxGram;
 
     posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    posLenAtt = addAttribute(PositionLengthAttribute.class);
   }
 
   /**
@@ -104,9 +99,7 @@ public final class NGramTokenFilter extends TokenFilter {
           curGramSize = minGram;
           curPos = 0;
           curPosInc = posIncAtt.getPositionIncrement();
-          curPosLen = posLenAtt.getPositionLength();
-          tokStart = offsetAtt.startOffset();
-          tokEnd = offsetAtt.endOffset();
+          state = captureState();
         }
       }
 
@@ -115,14 +108,12 @@ public final class NGramTokenFilter extends TokenFilter {
         curGramSize = minGram;
       }
       if ((curPos + curGramSize) <= curCodePointCount) {
-        clearAttributes();
+        restoreState(state);
         final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
         final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
         termAtt.copyBuffer(curTermBuffer, start, end - start);
         posIncAtt.setPositionIncrement(curPosInc);
         curPosInc = 0;
-        posLenAtt.setPositionLength(curPosLen);
-        offsetAtt.setOffset(tokStart, tokEnd);
         curGramSize++;
         return true;
       }