You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by si...@apache.org on 2009/11/22 22:09:44 UTC
svn commit: r883149 - in /lucene/java/trunk/contrib: CHANGES.txt
analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
Author: simonw
Date: Sun Nov 22 21:09:42 2009
New Revision: 883149
URL: http://svn.apache.org/viewvc?rev=883149&view=rev
Log:
LUCENE-2068: Fixed ReverseStringFilter for Unicode 4.0. Reverse Supplementary Characters correctly.
Modified:
lucene/java/trunk/contrib/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Nov 22 21:09:42 2009
@@ -2,6 +2,15 @@
======================= Trunk (not yet released) =======================
+Bug fixes
+
+ * LUCENE-2068: Fixed ReverseStringFilter which was not aware of supplementary
+ characters. During reverse the filter created unpaired surrogates, which
+ will be replaced by U+FFFD by the indexer, but not at query time. The filter
+ now reverses supplementary characters correctly if used with Version > 3.0.
+ (Simon Willnauer, Robert Muir)
+
+
======================= Release 3.0.0 2009-11-25 =======================
Changes in backwards compatibility policy
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilter.java Sun Nov 22 21:09:42 2009
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
@@ -31,11 +32,19 @@
* "\u0001yrtnuoc". This is useful when implementing efficient leading
* wildcards search.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ReverseStringFilter, or when using any of
+ * its static methods:
+ * <ul>
+ * <li> As of 3.1, supplementary characters are handled correctly
+ * </ul>
*/
public final class ReverseStringFilter extends TokenFilter {
private TermAttribute termAtt;
private final char marker;
+ private final Version matchVersion;
private static final char NOMARKER = '\uFFFF';
/**
@@ -66,11 +75,13 @@
* </p>
*
* @param in {@link TokenStream} to filter
+ * @deprecated use {@link #ReverseStringFilter(Version, TokenStream)}
+ * instead. This constructor will be removed in Lucene 4.0
*/
public ReverseStringFilter(TokenStream in) {
this(in, NOMARKER);
}
-
+
/**
* Create a new ReverseStringFilter that reverses and marks all tokens in the
* supplied {@link TokenStream}.
@@ -81,9 +92,42 @@
*
* @param in {@link TokenStream} to filter
* @param marker A character used to mark reversed tokens
+ * @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)}
+ * instead. This constructor will be removed in Lucene 4.0
*/
public ReverseStringFilter(TokenStream in, char marker) {
+ this(Version.LUCENE_30, in, marker);
+ }
+
+ /**
+ * Create a new ReverseStringFilter that reverses all tokens in the
+ * supplied {@link TokenStream}.
+ * <p>
+ * The reversed tokens will not be marked.
+ * </p>
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param in {@link TokenStream} to filter
+ */
+ public ReverseStringFilter(Version matchVersion, TokenStream in) {
+ this(matchVersion, in, NOMARKER);
+ }
+
+ /**
+ * Create a new ReverseStringFilter that reverses and marks all tokens in the
+ * supplied {@link TokenStream}.
+ * <p>
+ * The reversed tokens will be prepended (marked) by the <code>marker</code>
+ * character.
+ * </p>
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param in {@link TokenStream} to filter
+ * @param marker A character used to mark reversed tokens
+ */
+ public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
super(in);
+ this.matchVersion = matchVersion;
this.marker = marker;
termAtt = addAttribute(TermAttribute.class);
}
@@ -97,7 +141,7 @@
termAtt.resizeTermBuffer(len);
termAtt.termBuffer()[len - 1] = marker;
}
- reverse( termAtt.termBuffer(), len );
+ reverse( matchVersion, termAtt.termBuffer(), 0, len );
termAtt.setTermLength(len);
return true;
} else {
@@ -105,21 +149,94 @@
}
}
+ /**
+ * Reverses the given input string
+ *
+ * @param input the string to reverse
+ * @return the given input string in reversed order
+ * @deprecated use {@link #reverse(Version, String)} instead. This method
+ * will be removed in Lucene 4.0
+ */
public static String reverse( final String input ){
- char[] charInput = input.toCharArray();
- reverse( charInput );
+ return reverse(Version.LUCENE_30, input);
+ }
+
+ /**
+ * Reverses the given input string
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param input the string to reverse
+ * @return the given input string in reversed order
+ */
+ public static String reverse( Version matchVersion, final String input ){
+ final char[] charInput = input.toCharArray();
+ reverse( matchVersion, charInput, 0, charInput.length );
return new String( charInput );
}
- public static void reverse( char[] buffer ){
- reverse( buffer, buffer.length );
+ /**
+ * Reverses the given input buffer in-place
+ * @param buffer the input char array to reverse
+ * @deprecated use {@link #reverse(Version, char[])} instead. This
+ * method will be removed in Lucene 4.0
+ */
+ public static void reverse( final char[] buffer ){
+ reverse( buffer, 0, buffer.length );
}
- public static void reverse( char[] buffer, int len ){
+ /**
+ * Reverses the given input buffer in-place
+ * @param matchVersion See <a href="#version">above</a>
+ * @param buffer the input char array to reverse
+ */
+ public static void reverse(Version matchVersion, final char[] buffer) {
+ reverse(matchVersion, buffer, 0, buffer.length);
+ }
+
+ /**
+ * Partially reverses the given input buffer in-place from offset 0
+ * up to the given length.
+ * @param buffer the input char array to reverse
+ * @param len the length in the buffer up to where the
+ * buffer should be reversed
+ * @deprecated use {@link #reverse(Version, char[], int)} instead. This
+ * method will be removed in Lucene 4.0
+ */
+ public static void reverse( final char[] buffer, final int len ){
reverse( buffer, 0, len );
}
- public static void reverse( char[] buffer, int start, int len ){
+ /**
+ * Partially reverses the given input buffer in-place from offset 0
+ * up to the given length.
+ * @param matchVersion See <a href="#version">above</a>
+ * @param buffer the input char array to reverse
+ * @param len the length in the buffer up to where the
+ * buffer should be reversed
+ */
+ public static void reverse(Version matchVersion, final char[] buffer,
+ final int len) {
+ reverse( matchVersion, buffer, 0, len );
+ }
+
+ /**
+ * Partially reverses the given input buffer in-place from the given offset
+ * up to the given length.
+ * @param buffer the input char array to reverse
+ * @param start the offset from where to reverse the buffer
+ * @param len the length in the buffer up to where the
+ * buffer should be reversed
+ * @deprecated use {@link #reverse(Version, char[], int, int)} instead. This
+ * method will be removed in Lucene 4.0
+ */
+ public static void reverse(char[] buffer, int start, int len ) {
+ reverseUnicode3(buffer, start, len);
+ }
+
+ /**
+ * @deprecated Remove this when support for 3.0 indexes is no longer needed.
+ */
+ private static void reverseUnicode3( char[] buffer, int start, int len ){
if( len <= 1 ) return;
int num = len>>1;
for( int i = start; i < ( start + num ); i++ ){
@@ -128,4 +245,77 @@
buffer[start * 2 + len - i - 1] = c;
}
}
+
+ /**
+ * Partially reverses the given input buffer in-place from the given offset
+ * up to the given length.
+ * @param matchVersion See <a href="#version">above</a>
+ * @param buffer the input char array to reverse
+ * @param start the offset from where to reverse the buffer
+ * @param len the length in the buffer up to where the
+ * buffer should be reversed
+ */
+ public static void reverse(Version matchVersion, final char[] buffer,
+ final int start, final int len) {
+ if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
+ reverseUnicode3(buffer, start, len);
+ return;
+ }
+ /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
+ if (len < 2)
+ return;
+ int end = (start + len) - 1;
+ char frontHigh = buffer[start];
+ char endLow = buffer[end];
+ boolean allowFrontSur = true, allowEndSur = true;
+ final int mid = start + (len >> 1);
+ for (int i = start; i < mid; ++i, --end) {
+ final char frontLow = buffer[i + 1];
+ final char endHigh = buffer[end - 1];
+ final boolean surAtFront = allowFrontSur
+ && Character.isSurrogatePair(frontHigh, frontLow);
+ if (surAtFront && (len < 3)) {
+ // nothing to do since surAtFront is allowed and 1 char left
+ return;
+ }
+ final boolean surAtEnd = allowEndSur
+ && Character.isSurrogatePair(endHigh, endLow);
+ allowFrontSur = allowEndSur = true;
+ if (surAtFront == surAtEnd) {
+ if (surAtFront) {
+ // both surrogates
+ buffer[end] = frontLow;
+ buffer[--end] = frontHigh;
+ buffer[i] = endHigh;
+ buffer[++i] = endLow;
+ frontHigh = buffer[i + 1];
+ endLow = buffer[end - 1];
+ } else {
+ // neither surrogates
+ buffer[end] = frontHigh;
+ buffer[i] = endLow;
+ frontHigh = frontLow;
+ endLow = endHigh;
+ }
+ } else {
+ if (surAtFront) {
+ // surrogate only at the front
+ buffer[end] = frontLow;
+ buffer[i] = endLow;
+ endLow = endHigh;
+ allowFrontSur = false;
+ } else {
+ // surrogate only at the end
+ buffer[end] = frontHigh;
+ buffer[i] = endHigh;
+ frontHigh = frontLow;
+ allowEndSur = false;
+ }
+ }
+ }
+ if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
+ // only if odd length
+ buffer[end] = allowFrontSur ? endLow : frontHigh;
+ }
+ }
}
Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java?rev=883149&r1=883148&r2=883149&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilter.java Sun Nov 22 21:09:42 2009
@@ -23,6 +23,7 @@
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
public class TestReverseStringFilter extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
@@ -73,4 +74,47 @@
ReverseStringFilter.reverse( buffer, 2, 3 );
assertEquals( "ABEDCF", new String( buffer ) );
}
+
+ /**
+ * Test the broken 3.0 behavior, for back compat
+ */
+ public void testBackCompat() throws Exception {
+ assertEquals("\uDF05\uD866\uDF05\uD866", ReverseStringFilter.reverse("𩬅𩬅"));
+ }
+
+ public void testReverseSupplementary() throws Exception {
+ // supplementary at end
+ assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅"));
+ // supplementary at end - 1
+ assertEquals("a𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅a"));
+ // supplementary at start
+ assertEquals("fedcba𩬅", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "𩬅abcdef"));
+ // supplementary at start + 1
+ assertEquals("fedcba𩬅z", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "z𩬅abcdef"));
+ // supplementary medial
+ assertEquals("gfe𩬅dcba", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "abcd𩬅efg"));
+ }
+
+ public void testReverseSupplementaryChar() throws Exception {
+ // supplementary at end
+ char[] buffer = "abcçæ¯ä¹éè±ð©¬
".toCharArray();
+ ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+ assertEquals("abc𩬅艱鍟䇹愯瀛", new String(buffer));
+ // supplementary at end - 1
+ buffer = "abcçæ¯ä¹éè±ð©¬
d".toCharArray();
+ ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+ assertEquals("abcd𩬅艱鍟䇹愯瀛", new String(buffer));
+ // supplementary at start
+ buffer = "abc𩬅瀛愯䇹鍟艱".toCharArray();
+ ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+ assertEquals("abcè±éä¹æ¯çð©¬
", new String(buffer));
+ // supplementary at start + 1
+ buffer = "abcd𩬅瀛愯䇹鍟艱".toCharArray();
+ ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+ assertEquals("abcè±éä¹æ¯çð©¬
d", new String(buffer));
+ // supplementary medial
+ buffer = "abcçæ¯ð©¬
def".toCharArray();
+ ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+ assertEquals("abcfed𩬅愯瀛", new String(buffer));
+ }
}