You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/08/17 20:14:45 UTC

svn commit: r1374381 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/charfilter/ analysis/common/src/test/org/apache/lucene/analysis/charfilter/ core/src/java/org/apache/lucene/util/fst/

Author: mikemccand
Date: Fri Aug 17 18:14:44 2012
New Revision: 1374381

URL: http://svn.apache.org/viewvc?rev=1374381&view=rev
Log:
LUCENE-4310: non-BMP characters were failing to match with MappingCharFilter

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1374381&r1=1374380&r2=1374381&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Aug 17 18:14:44 2012
@@ -79,6 +79,10 @@ Bug Fixes
   Also, ensure immutability and use only one instance of this table in RAM (lazy
   loaded) since it's quite large. (sausarkar, Steven Rowe, Robert Muir)
 
+* LUCENE-4310: MappingCharFilter was failing to match input strings
+  containing non-BMP Unicode characters.  (Dawid Weiss, Robert Muir,
+  Mike McCandless)
+
 Build
 
 * LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for 

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java?rev=1374381&r1=1374380&r2=1374381&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java Fri Aug 17 18:14:44 2012
@@ -111,9 +111,8 @@ public class NormalizeCharMap {
         final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
         final IntsRef scratch = new IntsRef();
         for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
-          builder.add(Util.toUTF32(ent.getKey(), scratch),
+          builder.add(Util.toUTF16(ent.getKey(), scratch),
                       new CharsRef(ent.getValue()));
-      
         }
         map = builder.finish();
         pendingPairs.clear();

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java?rev=1374381&r1=1374380&r2=1374381&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilter.java Fri Aug 17 18:14:44 2012
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.CharFi
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestMappingCharFilter extends BaseTokenStreamTestCase {
@@ -55,6 +56,11 @@ public class TestMappingCharFilter exten
 
     builder.add( "empty", "" );
 
+    // non-BMP (surrogate pair):
+    builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
+
+    builder.add("\uff01", "full-width-exclamation");
+
     normMap = builder.build();
   }
 
@@ -128,6 +134,18 @@ public class TestMappingCharFilter exten
     assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
   }
 
+  public void testNonBMPChar() throws Exception {
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
+  }
+
+  public void testFullWidthChar() throws Exception {
+    CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
+  }
+
   //
   //                1111111111222
   //      01234567890123456789012

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1374381&r1=1374380&r2=1374381&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Fri Aug 17 18:14:44 2012
@@ -767,6 +767,21 @@ public final class Util {
     }
   }
 
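+  /** Copies each UTF-16 code unit (char) of the provided
+   *  CharSequence, unchanged, into one int of the provided
+   *  scratch IntsRef, which must not be null, returning it.
+   *  Surrogate pairs are not combined into code points. */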
+  public static IntsRef toUTF16(CharSequence s, IntsRef scratch) {
+    final int charLimit = s.length();
+    scratch.grow(charLimit);
+    int idx = 0;
+    while(idx < charLimit) {
+      scratch.ints[idx] = (int) s.charAt(idx);
+      idx++;
+    }
+    scratch.offset = 0;
+    scratch.length = idx;
+    return scratch;
+  }
+
   /** Decodes the Unicode codepoints from the provided
    *  CharSequence and places them in the provided scratch
    *  IntsRef, which must not be null, returning it. */