You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/02/08 10:22:13 UTC
[2/3] lucene-solr:branch_5_5: LUCENE-6974: Fixed DecimalDigitFilter
in case of supplementary code points.
LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/91147a84
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/91147a84
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/91147a84
Branch: refs/heads/branch_5_5
Commit: 91147a84515e0d84201a36a0aedaab40806a02a0
Parents: 28d405c
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Oct 18 10:38:51 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Feb 8 10:59:12 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../analysis/core/DecimalDigitFilter.java | 2 +-
.../analysis/core/TestDecimalDigitFilter.java | 151 +++++++++++++++++--
3 files changed, 140 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af76924..41eea64 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,6 +24,9 @@ Bug Fixes
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
dictionary file it opened (Markus via Mike McCandless)
+* LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points.
+ (Hossman)
+
Other
* LUCENE-6989: Backport MMapDirectory's unmapping code from Lucene 6.4 to use
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
index b81d42f..de459cf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
@@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
buffer[i] = (char) ('0' + Character.getNumericValue(ch));
// if the original was supplementary, shrink the string
if (ch > 0xFFFF) {
- length = StemmerUtil.delete(buffer, ++i, length);
+ length = StemmerUtil.delete(buffer, i+1, length);
termAtt.setLength(length);
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
index ae25193..e5e18ef 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
@@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
import org.apache.lucene.util.TestUtil;
+import java.util.Random;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
/**
* Tests for {@link DecimalDigitFilter}
*/
public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
private Analyzer tokenized;
private Analyzer keyword;
+
+ private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
+
+  /**
+   * Builds the shared bit set of decimal-digit codepoints once per test class: every codepoint
+   * in [MIN_CODE_POINT, MAX_CODE_POINT) for which {@link Character#isDigit(int)} is true gets
+   * its bit set.
+   */
+  @BeforeClass
+  public static void init_DECIMAL_DIGIT_CODEPOINTS() {
+    final SparseFixedBitSet digits = new SparseFixedBitSet(Character.MAX_CODE_POINT);
+    // publish first: the loop below fills the very same instance
+    DECIMAL_DIGIT_CODEPOINTS = digits;
+    for (int cp = Character.MIN_CODE_POINT; cp < Character.MAX_CODE_POINT; cp++) {
+      if (!Character.isDigit(cp)) {
+        continue;
+      }
+      digits.set(cp);
+    }
+    // sanity check: at least one digit must have been recorded
+    assert 0 < digits.cardinality();
+  }
+
+ @AfterClass
+ public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
+ DECIMAL_DIGIT_CODEPOINTS = null; // clear the static reference once the class's tests are done, so the bit set is no longer held by this class
+ }
+
@Override
public void setUp() throws Exception {
@@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
}
/**
- * test all digits in different locations of strings.
+ * test that double struck digits are normalized
*/
- public void testRandom() throws Exception {
- for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
- if (Character.isDigit(codepoint)) {
- // add some a-z before/after the string
- String prefix = TestUtil.randomSimpleString(random());
- String suffix = TestUtil.randomSimpleString(random());
+  public void testDoubleStruck() throws Exception {
+    // MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4 (U+1D7D9, U+1D7E1, U+1D7E0, U+1D7DC).
+    // These are supplementary code points, so each one is spelled as a UTF-16 surrogate pair;
+    // Java string literals have no 8-hex-digit escape form.
+    final String input = "\uD835\uDFD9 \uD835\uDFE1 \uD835\uDFE0 \uD835\uDFDC";
+    final String expected = "1 9 8 4";
+    // digits separated by whitespace
+    checkOneTerm(keyword, input, expected);
+    // the same digits back to back, i.e. consecutive supplementary code points
+    checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s",""));
+  }
+
+ /**
+ * test sequences of digits mixed with other random simple string data
+ */
+ public void testRandomSequences() throws Exception {
+
+ // test numIters random strings containing a sequence of numDigits codepoints
+ final int numIters = atLeast(5);
+ for (int iter = 0; iter < numIters; iter++) {
+ final int numDigits = atLeast(20);
+ final StringBuilder expected = new StringBuilder();
+ final StringBuilder actual = new StringBuilder();
+ for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
- StringBuilder expected = new StringBuilder();
+ // increased odds of 0 length random string prefix
+ final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
expected.append(prefix);
+ actual.append(prefix);
+
+ int codepoint = getRandomDecimalDigit(random());
+
int value = Character.getNumericValue(codepoint);
assert value >= 0 && value <= 9;
expected.append(Integer.toString(value));
- expected.append(suffix);
-
- StringBuilder actual = new StringBuilder();
- actual.append(prefix);
actual.appendCodePoint(codepoint);
- actual.append(suffix);
-
- checkOneTerm(keyword, actual.toString(), expected.toString());
}
+ // occasional suffix, increased odds of 0 length random string
+ final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
+ expected.append(suffix);
+ actual.append(suffix);
+
+ checkOneTerm(keyword, actual.toString(), expected.toString());
}
+
+ }
+
+  /**
+   * Checks every known decimal-digit codepoint on its own, wrapped in a random simple-string
+   * prefix and suffix, and verifies that the number of codepoints visited matches the bit set.
+   */
+  public void testRandom() throws Exception {
+    int visited = 0; // sanity check
+    int cp = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
+    while (cp != DocIdSetIterator.NO_MORE_DOCS) {
+
+      assert Character.isDigit(cp);
+
+      // some a-z before/after the digit (prefix is drawn first, then suffix)
+      final String head = TestUtil.randomSimpleString(random());
+      final String tail = TestUtil.randomSimpleString(random());
+
+      final int digit = Character.getNumericValue(cp);
+      assert digit >= 0 && digit <= 9;
+
+      // expected output carries the ASCII digit, the input carries the original codepoint
+      final String wanted = head + Integer.toString(digit) + tail;
+      final String given = new StringBuilder(head).appendCodePoint(cp).append(tail).toString();
+
+      checkOneTerm(keyword, given, wanted);
+
+      visited++;
+      cp = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(cp + 1);
+    }
+    assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == visited;
}
/**
@@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
public void testRandomStrings() throws Exception {
checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
}
+
+  /**
+   * Returns a pseudo-random codepoint which is a Decimal Digit.
+   * <p>
+   * A random index into {@code DECIMAL_DIGIT_CODEPOINTS} is drawn; if its bit is set it is
+   * returned as is, otherwise the closest set bit below or above it is returned, with exact
+   * ties broken by a coin flip. Because the nearest set bit is chosen, digits are not
+   * necessarily picked with equal probability.
+   *
+   * @param r source of randomness; every random choice made by this method is drawn from it
+   * @return a codepoint whose bit is set in {@code DECIMAL_DIGIT_CODEPOINTS}
+   */
+  public static int getRandomDecimalDigit(Random r) {
+    final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
+
+    if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess
+      assert Character.isDigit(aprox);
+      return aprox;
+    }
+
+    // seek up and down for closest set bit
+    final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox);
+    final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox);
+
+    // sanity check edge cases
+    if (lower < 0) {
+      assert higher != DocIdSetIterator.NO_MORE_DOCS;
+      assert Character.isDigit(higher);
+      return higher;
+    }
+    if (higher == DocIdSetIterator.NO_MORE_DOCS) {
+      assert 0 <= lower;
+      assert Character.isDigit(lower);
+      return lower;
+    }
+
+    // which is closer?
+    final int cmp = Integer.compare(aprox - lower, higher - aprox);
+
+    if (0 == cmp) {
+      // dead even, flip a coin -- drawn from the caller's Random rather than the test-global
+      // random(), so the result depends only on r
+      final int result = r.nextBoolean() ? lower : higher;
+      assert Character.isDigit(result);
+      return result;
+    }
+
+    final int result = (cmp < 0) ? lower : higher;
+    assert Character.isDigit(result);
+    return result;
+  }
}