You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/10/06 08:57:38 UTC
[1/2] lucene-solr:master: LUCENE-7468: ASCIIFoldingFilter should not
emit duplicated tokens when preserve original is on.
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 59d83f57e -> 739c0a7bf
refs/heads/master 36b3b0884 -> 28d187acd
LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/28d187ac
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/28d187ac
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/28d187ac
Branch: refs/heads/master
Commit: 28d187acd1e391723eb6e1b5445f22abf5580a80
Parents: 36b3b08
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Oct 6 10:56:43 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Oct 6 10:56:43 2016 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../miscellaneous/ASCIIFoldingFilter.java | 24 +++++++++++++++++---
.../miscellaneous/TestASCIIFoldingFilter.java | 13 +++++++++++
3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a1273d7..4437792 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,9 @@ Bug Fixes
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
merge method (Julien MASSENET via Mike McCandless)
+* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
+ preserve original is on. (David Causse via Adrien Grand)
+
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
index a327d17..686c7a6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
@@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
*/
public void foldToASCII(char[] input, int length)
{
- if (preserveOriginal) {
- state = captureState();
- }
// Worst-case length required:
final int maxSizeNeeded = 4 * length;
if (output.length < maxSizeNeeded) {
@@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
}
outputPos = foldToASCII(input, 0, output, 0, length);
+ if (preserveOriginal && needToPreserve(input, length)) {
+ state = captureState();
+ }
+ }
+
+ /**
+ * Check if foldToASCII generated a different token.
+ * @param input original term
+ * @param inputLength length of the original term
+ * @return true if foldToASCII generated a different token
+ */
+ private boolean needToPreserve(char[] input, int inputLength) {
+ if(inputLength != outputPos) {
+ return true;
+ }
+ for(int i = 0; i < inputLength; i++) {
+ if(input[i] != output[i]) {
+ return true;
+ }
+ }
+ return false;
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/28d187ac/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
index 5225aaa..0e6e4fb 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
@@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
assertFalse(filter.incrementToken());
}
+ // Test that we do not emit duplicated tokens when preserve original is on
+ public void testUnmodifiedLetters() throws Exception {
+ TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
+ ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
+
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ filter.reset();
+ assertNextTerms("§", "§", filter, termAtt);
+ assertNextTerms("¦", "¦", filter, termAtt);
+ assertNextTerms("¤", "¤", filter, termAtt);
+ assertNextTerms("END", "END", filter, termAtt);
+ assertFalse(filter.incrementToken());
+ }
// The following Perl script generated the foldings[] array automatically
// from ASCIIFoldingFilter.java:
[2/2] lucene-solr:branch_6x: LUCENE-7468: ASCIIFoldingFilter should
not emit duplicated tokens when preserve original is on.
Posted by jp...@apache.org.
LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when preserve original is on.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/739c0a7b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/739c0a7b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/739c0a7b
Branch: refs/heads/branch_6x
Commit: 739c0a7bf2c911e25ed40fb6717d9aed641a0a2f
Parents: 59d83f5
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Oct 6 10:56:43 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Oct 6 10:57:10 2016 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
.../miscellaneous/ASCIIFoldingFilter.java | 24 +++++++++++++++++---
.../miscellaneous/TestASCIIFoldingFilter.java | 13 +++++++++++
3 files changed, 37 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4ebd619..591e3d2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,6 +23,9 @@ Bug Fixes
* LUCENE-7456: PerFieldPostings/DocValues was failing to delegate the
merge method (Julien MASSENET via Mike McCandless)
+* LUCENE-7468: ASCIIFoldingFilter should not emit duplicated tokens when
+ preserve original is on. (David Causse via Adrien Grand)
+
Improvements
* LUCENE-7439: FuzzyQuery now matches all terms within the specified
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
index a327d17..686c7a6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.java
@@ -134,9 +134,6 @@ public final class ASCIIFoldingFilter extends TokenFilter {
*/
public void foldToASCII(char[] input, int length)
{
- if (preserveOriginal) {
- state = captureState();
- }
// Worst-case length required:
final int maxSizeNeeded = 4 * length;
if (output.length < maxSizeNeeded) {
@@ -144,6 +141,27 @@ public final class ASCIIFoldingFilter extends TokenFilter {
}
outputPos = foldToASCII(input, 0, output, 0, length);
+ if (preserveOriginal && needToPreserve(input, length)) {
+ state = captureState();
+ }
+ }
+
+ /**
+ * Check if foldToASCII generated a different token.
+ * @param input original term
+ * @param inputLength length of the original term
+ * @return true if foldToASCII generated a different token
+ */
+ private boolean needToPreserve(char[] input, int inputLength) {
+ if(inputLength != outputPos) {
+ return true;
+ }
+ for(int i = 0; i < inputLength; i++) {
+ if(input[i] != output[i]) {
+ return true;
+ }
+ }
+ return false;
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/739c0a7b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
index 5225aaa..0e6e4fb 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestASCIIFoldingFilter.java
@@ -131,6 +131,19 @@ public class TestASCIIFoldingFilter extends BaseTokenStreamTestCase {
assertFalse(filter.incrementToken());
}
+ // Test that we do not emit duplicated tokens when preserve original is on
+ public void testUnmodifiedLetters() throws Exception {
+ TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
+ ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
+
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ filter.reset();
+ assertNextTerms("§", "§", filter, termAtt);
+ assertNextTerms("¦", "¦", filter, termAtt);
+ assertNextTerms("¤", "¤", filter, termAtt);
+ assertNextTerms("END", "END", filter, termAtt);
+ assertFalse(filter.incrementToken());
+ }
// The following Perl script generated the foldings[] array automatically
// from ASCIIFoldingFilter.java: