You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/02/08 10:22:13 UTC

[2/3] lucene-solr:branch_5_5: LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points.

LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/91147a84
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/91147a84
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/91147a84

Branch: refs/heads/branch_5_5
Commit: 91147a84515e0d84201a36a0aedaab40806a02a0
Parents: 28d405c
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Oct 18 10:38:51 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Feb 8 10:59:12 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 +
 .../analysis/core/DecimalDigitFilter.java       |   2 +-
 .../analysis/core/TestDecimalDigitFilter.java   | 151 +++++++++++++++++--
 3 files changed, 140 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index af76924..41eea64 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -24,6 +24,9 @@ Bug Fixes
 * LUCENE-7547: JapaneseTokenizerFactory was failing to close the
   dictionary file it opened (Markus via Mike McCandless)
 
+* LUCENE-6974: Fixed DecimalDigitFilter in case of supplementary code points.
+  (Hossman)
+
 Other
 
 * LUCENE-6989: Backport MMapDirectory's unmapping code from Lucene 6.4 to use

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
index b81d42f..de459cf 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/DecimalDigitFilter.java
@@ -52,7 +52,7 @@ public final class DecimalDigitFilter extends TokenFilter {
           buffer[i] = (char) ('0' + Character.getNumericValue(ch));
           // if the original was supplementary, shrink the string
           if (ch > 0xFFFF) {
-            length = StemmerUtil.delete(buffer, ++i, length);
+            length = StemmerUtil.delete(buffer, i+1, length);
             termAtt.setLength(length);
           }
         }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/91147a84/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
index ae25193..e5e18ef 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestDecimalDigitFilter.java
@@ -21,14 +21,42 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
 import org.apache.lucene.util.TestUtil;
 
+import java.util.Random;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
 /**
  * Tests for {@link DecimalDigitFilter}
  */
 public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
   private Analyzer tokenized;
   private Analyzer keyword;
+
+  private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS;
+
+  @BeforeClass
+  public static void init_DECIMAL_DIGIT_CODEPOINTS() {
+    DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT);
+    for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
+      if (Character.isDigit(codepoint)) {
+        DECIMAL_DIGIT_CODEPOINTS.set(codepoint);
+      }
+    }
+    assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality();
+  }
+  
+  @AfterClass
+  public static void destroy_DECIMAL_DIGIT_CODEPOINTS() {
+    DECIMAL_DIGIT_CODEPOINTS = null;
+  }
+
   
   @Override
   public void setUp() throws Exception {
@@ -64,30 +92,83 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
   }
   
   /**
-   * test all digits in different locations of strings.
+   * test that double struck digits are normalized
    */
-  public void testRandom() throws Exception {
-    for (int codepoint = Character.MIN_CODE_POINT; codepoint < Character.MAX_CODE_POINT; codepoint++) {
-      if (Character.isDigit(codepoint)) {
-        // add some a-z before/after the string
-        String prefix = TestUtil.randomSimpleString(random());
-        String suffix = TestUtil.randomSimpleString(random());
+  public void testDoubleStruck() throws Exception {
+    // MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4 (U+1D7D9, U+1D7E1, U+1D7E0, U+1D7DC as surrogate pairs)
+    final String input = "\uD835\uDFD9 \uD835\uDFE1 \uD835\uDFE0 \uD835\uDFDC";
+    final String expected = "1 9 8 4";
+    checkOneTerm(keyword, input, expected);
+    checkOneTerm(keyword, input.replaceAll("\\s",""), expected.replaceAll("\\s",""));
+  }
+
+  /**
+   * test sequences of digits mixed with other random simple string data
+   */
+  public void testRandomSequences() throws Exception {
+    
+    // test numIters random strings containing a sequence of numDigits codepoints
+    final int numIters = atLeast(5);
+    for (int iter = 0; iter < numIters; iter++) {
+      final int numDigits = atLeast(20);
+      final StringBuilder expected = new StringBuilder();
+      final StringBuilder actual = new StringBuilder();
+      for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) {
         
-        StringBuilder expected = new StringBuilder();
+        // increased odds of 0 length random string prefix
+        final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
         expected.append(prefix);
+        actual.append(prefix);
+        
+        int codepoint = getRandomDecimalDigit(random());
+
         int value = Character.getNumericValue(codepoint);
         assert value >= 0 && value <= 9;
         expected.append(Integer.toString(value));
-        expected.append(suffix);
-        
-        StringBuilder actual = new StringBuilder();
-        actual.append(prefix);
         actual.appendCodePoint(codepoint);
-        actual.append(suffix);
-        
-        checkOneTerm(keyword, actual.toString(), expected.toString());
       }
+      // occasional suffix, increased odds of 0 length random string
+      final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random());
+      expected.append(suffix);
+      actual.append(suffix);
+      
+      checkOneTerm(keyword, actual.toString(), expected.toString());
     }
+
+  }
+  
+  /**
+   * test each individual digit in different locations of strings.
+   */
+  public void testRandom() throws Exception {
+    int numCodePointsChecked = 0; // sanity check
+    for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0);
+         codepoint != DocIdSetIterator.NO_MORE_DOCS;
+         codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint+1)) {
+      
+      assert Character.isDigit(codepoint);
+      
+      // add some a-z before/after the string
+      String prefix = TestUtil.randomSimpleString(random());
+      String suffix = TestUtil.randomSimpleString(random());
+      
+      StringBuilder expected = new StringBuilder();
+      expected.append(prefix);
+      int value = Character.getNumericValue(codepoint);
+      assert value >= 0 && value <= 9;
+      expected.append(Integer.toString(value));
+      expected.append(suffix);
+      
+      StringBuilder actual = new StringBuilder();
+      actual.append(prefix);
+      actual.appendCodePoint(codepoint);
+      actual.append(suffix);
+      
+      checkOneTerm(keyword, actual.toString(), expected.toString());
+      
+      numCodePointsChecked++;
+    }
+    assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked;
   }
   
   /**
@@ -103,4 +184,44 @@ public class TestDecimalDigitFilter extends BaseTokenStreamTestCase {
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), tokenized, 1000*RANDOM_MULTIPLIER);
   }
+
+  /** Returns a pseudo-random code point that is a decimal digit, drawing its randomness from {@code r}. */
+  public static int getRandomDecimalDigit(Random r) {
+    final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length()-1);
+    
+    if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess
+      assert Character.isDigit(aprox);
+      return aprox;
+    }
+    
+    // seek up and down for closest set bit
+    final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox);
+    final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox);
+    
+    // sanity check edge cases
+    if (lower < 0) {
+      assert higher != DocIdSetIterator.NO_MORE_DOCS;
+      assert Character.isDigit(higher);
+      return higher;
+    }
+    if (higher == DocIdSetIterator.NO_MORE_DOCS) {
+      assert 0 <= lower;
+      assert Character.isDigit(lower);
+      return lower;
+    }
+    
+    // which is closer?
+    final int cmp = Integer.compare(aprox - lower, higher - aprox);
+    
+    if (0 == cmp) {
+      // dead even, flip a coin with the caller-supplied Random (not an unrelated source)
+      final int result = r.nextBoolean() ? lower : higher;
+      assert Character.isDigit(result);
+      return result;
+    }
+    
+    final int result = (cmp < 0) ? lower : higher;
+    assert Character.isDigit(result);
+    return result;
+  }
 }