You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/10/06 08:57:38 UTC

[1/2] lucene-solr:master: LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 59d83f57e -> 739c0a7bf
  refs/heads/master 36b3b0884 -> 28d187acd


LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/28d187ac
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/28d187ac
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/28d187ac

Branch: refs/heads/master
Commit: 28d187acd1e391723eb6e1b5445f22abf5580a80
Parents: 36b3b08
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Oct 6 10:56:43 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Oct 6 10:56:43 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +++
 .../miscellaneous/ASCIIFoldingFilter.java       | 24 +++++++++++++++++---
 .../miscellaneous/TestASCIIFoldingFilter.java   | 13 +++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a1273d7..4437792 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,9 @@ Bug Fixes
 * LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
   merge method (Julien MASSENET via Mike McCandless)
 
+* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
+  preserve original is on. (David Causse via Adrien Grand)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
index a327d17..686c7a6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
@@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
    */
   public void foldToASCII(char[] input, int length)
   {
-    if (preserveOriginal) {
-      state = captureState();
-    }
     // Worst-case length required:
     final int maxSizeNeeded = 4 * length;
     if (output.length < maxSizeNeeded) {
@@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
     }
 
     outputPos = foldToASCII(input, 0, output, 0, length);
+    if (preserveOriginal && needToPreserve(input, length)) {
+      state = captureState();
+    }
+  }
+
+  /**
+   * Check if foldToASCII generated a different token.
+   * @param input original term
+   * @param inputLength length of the original term
+   * @return true if foldToASCII generated a different token
+   */
+  private boolean needToPreserve(char[] input, int inputLength) {
+    if(inputLength != outputPos) {
+      return true;
+    }
+    for(int i = 0; i < inputLength; i++) {
+      if(input[i] != output[i]) {
+        return true;
+      }
+    }
+    return false;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
index 5225aaa..0e6e4fb 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
@@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
     assertFalse(filter.incrementToken());
   }
 
+  // Test that we do not emit duplicated tokens when preserve original is on
+  public void testUnmodifiedLetters() throws Exception {
+    TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
+    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
+
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    filter.reset();
+    assertNextTerms("§", "§", filter, termAtt);
+    assertNextTerms("¦", "¦", filter, termAtt);
+    assertNextTerms("¤", "¤", filter, termAtt);
+    assertNextTerms("END", "END", filter, termAtt);
+    assertFalse(filter.incrementToken());
+  }
 
   // The following Perl script generated the foldings[] array automatically
   // from ASCIIFoldingFilter.java:


[2/2] lucene-solr:branch_6x: LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.

Posted by jp...@apache.org.
LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/739c0a7b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/739c0a7b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/739c0a7b

Branch: refs/heads/branch_6x
Commit: 739c0a7bf2c911e25ed40fb6717d9aed641a0a2f
Parents: 59d83f5
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Oct 6 10:56:43 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Oct 6 10:57:10 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +++
 .../miscellaneous/ASCIIFoldingFilter.java       | 24 +++++++++++++++++---
 .../miscellaneous/TestASCIIFoldingFilter.java   | 13 +++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4ebd619..591e3d2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,9 @@ Bug Fixes
 * LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
   merge method (Julien MASSENET via Mike McCandless)
 
+* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
+  preserve original is on. (David Causse via Adrien Grand)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
index a327d17..686c7a6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
@@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
    */
   public void foldToASCII(char[] input, int length)
   {
-    if (preserveOriginal) {
-      state = captureState();
-    }
     // Worst-case length required:
     final int maxSizeNeeded = 4 * length;
     if (output.length < maxSizeNeeded) {
@@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
     }
 
     outputPos = foldToASCII(input, 0, output, 0, length);
+    if (preserveOriginal && needToPreserve(input, length)) {
+      state = captureState();
+    }
+  }
+
+  /**
+   * Check if foldToASCII generated a different token.
+   * @param input original term
+   * @param inputLength length of the original term
+   * @return true if foldToASCII generated a different token
+   */
+  private boolean needToPreserve(char[] input, int inputLength) {
+    if(inputLength != outputPos) {
+      return true;
+    }
+    for(int i = 0; i < inputLength; i++) {
+      if(input[i] != output[i]) {
+        return true;
+      }
+    }
+    return false;
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
index 5225aaa..0e6e4fb 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
@@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
     assertFalse(filter.incrementToken());
   }
 
+  // Test that we do not emit duplicated tokens when preserve original is on
+  public void testUnmodifiedLetters() throws Exception {
+    TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
+    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
+
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+    filter.reset();
+    assertNextTerms("§", "§", filter, termAtt);
+    assertNextTerms("¦", "¦", filter, termAtt);
+    assertNextTerms("¤", "¤", filter, termAtt);
+    assertNextTerms("END", "END", filter, termAtt);
+    assertFalse(filter.incrementToken());
+  }
 
   // The following Perl script generated the foldings[] array automatically
   // from ASCIIFoldingFilter.java: