You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by si...@apache.org on 2009/11/22 22:09:44 UTC

svn commit: r883149 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java

Author: simonw
Date: Sun Nov 22 21:09:42 2009
New Revision: 883149

URL: http://svn.apache.org/viewvc?rev=883149&view=rev
Log:
LUCENE-2068: Fixed ReverseStringFilter for Unicode 4.0. Reverse Supplementary Characters correctly.

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Nov 22 21:09:42 2009
@@ -2,6 +2,15 @@
 
 ======================= Trunk (not yet released) =======================
 
+Bug fixes
+
+ * LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
+   characters. During reverse the filter created unpaired surrogates, which
+   will be replaced by U+FFFD by the indexer, but not at query time. The filter
+   now reverses supplementary characters correctly if used with Version > 3.0.
+   (Simon Willnauer, Robert Muir)
+   
+
 ======================= Release 3.0.0 2009-11-25 =======================
 
 Changes in backwards compatibility policy

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java Sun Nov 22 21:09:42 2009
@@ -20,6 +20,7 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 
@@ -31,11 +32,19 @@
  * "\u0001yrtnuoc". This is useful when implementing efficient leading
  * wildcards search.
  * </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ReverseStringFilter, or when using any of
+ * its static methods:
+ * <ul>
+ *   <li> As of 3.1, supplementary characters are handled correctly
+ * </ul>
  */
 public final class ReverseStringFilter extends TokenFilter {
 
   private TermAttribute termAtt;
   private final char marker;
+  private final Version matchVersion;
   private static final char NOMARKER = '\uFFFF';
   
   /**
@@ -66,11 +75,13 @@
    * </p>
    * 
    * @param in {@link TokenStream} to filter
+   * @deprecated use {@link #ReverseStringFilter(Version, TokenStream)} 
+   *    instead. This constructor will be removed in Lucene 4.0
    */
   public ReverseStringFilter(TokenStream in) {
     this(in, NOMARKER);
   }
-
+  
   /**
    * Create a new ReverseStringFilter that reverses and marks all tokens in the
    * supplied {@link TokenStream}.
@@ -81,9 +92,42 @@
    * 
    * @param in {@link TokenStream} to filter
    * @param marker A character used to mark reversed tokens
+   * @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)} 
+   *    instead. This constructor will be removed in Lucene 4.0 
    */
   public ReverseStringFilter(TokenStream in, char marker) {
+    this(Version.LUCENE_30, in, marker);
+  }
+  
+  /**
+   * Create a new ReverseStringFilter that reverses all tokens in the 
+   * supplied {@link TokenStream}.
+   * <p>
+   * The reversed tokens will not be marked. 
+   * </p>
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param in {@link TokenStream} to filter
+   */
+  public ReverseStringFilter(Version matchVersion, TokenStream in) {
+    this(matchVersion, in, NOMARKER);
+  }
+
+  /**
+   * Create a new ReverseStringFilter that reverses and marks all tokens in the
+   * supplied {@link TokenStream}.
+   * <p>
+   * The reversed tokens will be prepended (marked) by the <code>marker</code>
+   * character.
+   * </p>
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param in {@link TokenStream} to filter
+   * @param marker A character used to mark reversed tokens
+   */
+  public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
     super(in);
+    this.matchVersion = matchVersion;
     this.marker = marker;
     termAtt = addAttribute(TermAttribute.class);
   }
@@ -97,7 +141,7 @@
         termAtt.resizeTermBuffer(len);
         termAtt.termBuffer()[len - 1] = marker;
       }
-      reverse( termAtt.termBuffer(), len );
+      reverse( matchVersion, termAtt.termBuffer(), 0, len );
       termAtt.setTermLength(len);
       return true;
     } else {
@@ -105,21 +149,94 @@
     }
   }
 
+  /**
+   * Reverses the given input string
+   * 
+   * @param input the string to reverse
+   * @return the given input string in reversed order
+   * @deprecated use {@link #reverse(Version, String)} instead. This method 
+   *    will be removed in Lucene 4.0
+   */
   public static String reverse( final String input ){
-    char[] charInput = input.toCharArray();
-    reverse( charInput );
+    return reverse(Version.LUCENE_30, input);
+  }
+  
+  /**
+   * Reverses the given input string
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param input the string to reverse
+   * @return the given input string in reversed order
+   */
+  public static String reverse( Version matchVersion, final String input ){
+    final char[] charInput = input.toCharArray();
+    reverse( matchVersion, charInput, 0, charInput.length );
     return new String( charInput );
   }
   
-  public static void reverse( char[] buffer ){
-    reverse( buffer, buffer.length );
+  /**
+   * Reverses the given input buffer in-place
+   * @param buffer the input char array to reverse
+   * @deprecated use {@link #reverse(Version, char[])} instead. This 
+   *    method will be removed in Lucene 4.0
+   */
+  public static void reverse( final char[] buffer ){
+    reverse( buffer, 0, buffer.length );
   }
   
-  public static void reverse( char[] buffer, int len ){
+  /**
+   * Reverses the given input buffer in-place
+   * @param matchVersion See <a href="#version">above</a>
+   * @param buffer the input char array to reverse
+   */
+  public static void reverse(Version matchVersion, final char[] buffer) {
+    reverse(matchVersion, buffer, 0, buffer.length);
+  }
+  
+  /**
+   * Partially reverses the given input buffer in-place from offset 0
+   * up to the given length.
+   * @param buffer the input char array to reverse
+   * @param len the length in the buffer up to where the
+   *        buffer should be reversed
+   * @deprecated use {@link #reverse(Version, char[], int)} instead. This 
+   *    method will be removed in Lucene 4.0
+   */
+  public static void reverse( final char[] buffer, final int len ){
     reverse( buffer, 0, len );
   }
   
-  public static void reverse( char[] buffer, int start, int len ){
+  /**
+   * Partially reverses the given input buffer in-place from offset 0
+   * up to the given length.
+   * @param matchVersion See <a href="#version">above</a>
+   * @param buffer the input char array to reverse
+   * @param len the length in the buffer up to where the
+   *        buffer should be reversed
+   */
+  public static void reverse(Version matchVersion, final char[] buffer,
+      final int len) {
+    reverse( matchVersion, buffer, 0, len );
+  }
+  
+  /**
+   * Partially reverses the given input buffer in-place from the given offset
+   * up to the given length.
+   * @param buffer the input char array to reverse
+   * @param start the offset from where to reverse the buffer
+   * @param len the length in the buffer up to where the
+   *        buffer should be reversed
+   * @deprecated use {@link #reverse(Version, char[], int, int)} instead. This 
+   *    method will be removed in Lucene 4.0
+   */
+  public static void reverse(char[] buffer, int start, int len ) {
+    reverseUnicode3(buffer, start, len);
+  }
+  
+  /**
+   * @deprecated Remove this when support for 3.0 indexes is no longer needed.
+   */
+  private static void reverseUnicode3( char[] buffer, int start, int len ){
     if( len <= 1 ) return;
     int num = len>>1;
     for( int i = start; i < ( start + num ); i++ ){
@@ -128,4 +245,77 @@
       buffer[start * 2 + len - i - 1] = c;
     }
   }
+  
+  /**
+   * Partially reverses the given input buffer in-place from the given offset
+   * up to the given length.
+   * @param matchVersion See <a href="#version">above</a>
+   * @param buffer the input char array to reverse
+   * @param start the offset from where to reverse the buffer
+   * @param len the length in the buffer up to where the
+   *        buffer should be reversed
+   */
+  public static void reverse(Version matchVersion, final char[] buffer,
+      final int start, final int len) {
+    if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
+      reverseUnicode3(buffer, start, len);
+      return;
+    }
+    /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
+    if (len < 2)
+      return;
+    int end = (start + len) - 1;
+    char frontHigh = buffer[start];
+    char endLow = buffer[end];
+    boolean allowFrontSur = true, allowEndSur = true;
+    final int mid = start + (len >> 1);
+    for (int i = start; i < mid; ++i, --end) {
+      final char frontLow = buffer[i + 1];
+      final char endHigh = buffer[end - 1];
+      final boolean surAtFront = allowFrontSur
+          && Character.isSurrogatePair(frontHigh, frontLow);
+      if (surAtFront && (len < 3)) {
+        // nothing to do since surAtFront is allowed and 1 char left
+        return;
+      }
+      final boolean surAtEnd = allowEndSur
+          && Character.isSurrogatePair(endHigh, endLow);
+      allowFrontSur = allowEndSur = true;
+      if (surAtFront == surAtEnd) {
+        if (surAtFront) {
+          // both surrogates
+          buffer[end] = frontLow;
+          buffer[--end] = frontHigh;
+          buffer[i] = endHigh;
+          buffer[++i] = endLow;
+          frontHigh = buffer[i + 1];
+          endLow = buffer[end - 1];
+        } else {
+          // neither surrogates
+          buffer[end] = frontHigh;
+          buffer[i] = endLow;
+          frontHigh = frontLow;
+          endLow = endHigh;
+        }
+      } else {
+        if (surAtFront) {
+          // surrogate only at the front
+          buffer[end] = frontLow;
+          buffer[i] = endLow;
+          endLow = endHigh;
+          allowFrontSur = false;
+        } else {
+          // surrogate only at the end
+          buffer[end] = frontHigh;
+          buffer[i] = endHigh;
+          frontHigh = frontLow;
+          allowEndSur = false;
+        }
+      }
+    }
+    if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
+      // only if odd length
+      buffer[end] = allowFrontSur ? endLow : frontHigh;
+    }
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Sun Nov 22 21:09:42 2009
@@ -23,6 +23,7 @@
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   public void testFilter() throws Exception {
@@ -73,4 +74,47 @@
     ReverseStringFilter.reverse( buffer, 2, 3 );
     assertEquals( "ABEDCF", new String( buffer ) );
   }
+  
+  /**
+   * Test the broken 3.0 behavior, for back compat
+   */
+  public void testBackCompat() throws Exception {
+    assertEquals("\uDF05\uD866\uDF05\uD866", ReverseStringFilter.reverse("𩬅𩬅"));
+  }
+  
+  public void testReverseSupplementary() throws Exception {
+    // supplementary at end
+    assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅"));
+    // supplementary at end - 1
+    assertEquals("a𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅a"));
+    // supplementary at start
+    assertEquals("fedcba𩬅", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "𩬅abcdef"));
+    // supplementary at start + 1
+    assertEquals("fedcba𩬅z", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "z𩬅abcdef"));
+    // supplementary medial
+    assertEquals("gfe𩬅dcba", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "abcd𩬅efg"));
+  }
+
+  public void testReverseSupplementaryChar() throws Exception {
+    // supplementary at end
+    char[] buffer = "abc瀛愯䇹鍟艱𩬅".toCharArray();
+    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    assertEquals("abc𩬅艱鍟䇹愯瀛", new String(buffer));
+    // supplementary at end - 1
+    buffer = "abc瀛愯䇹鍟艱𩬅d".toCharArray();
+    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+    assertEquals("abcd𩬅艱鍟䇹愯瀛", new String(buffer));
+    // supplementary at start
+    buffer = "abc𩬅瀛愯䇹鍟艱".toCharArray();
+    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    assertEquals("abc艱鍟䇹愯瀛𩬅", new String(buffer));
+    // supplementary at start + 1
+    buffer = "abcd𩬅瀛愯䇹鍟艱".toCharArray();
+    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+    assertEquals("abc艱鍟䇹愯瀛𩬅d", new String(buffer));
+    // supplementary medial
+    buffer = "abc瀛愯𩬅def".toCharArray();
+    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    assertEquals("abcfed𩬅愯瀛", new String(buffer));
+  }
 }