You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2009/11/23 17:09:05 UTC
svn commit: r883386 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
src/java/org/apache/solr/search/SolrQueryParser.java
src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
Author: yonik
Date: Mon Nov 23 16:09:05 2009
New Revision: 883386
URL: http://svn.apache.org/viewvc?rev=883386&view=rev
Log:
SOLR-1593: fix reverse wildcard filter for surrogate pairs
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Nov 23 16:09:05 2009
@@ -74,6 +74,11 @@
fl=score to the parameter list instead of appending score to the
existing field list. (yonik)
+* SOLR-1593: ReversedWildcardFilter didn't work for surrogate pairs
+ (i.e. code points outside of the BMP), resulting in incorrect
+ matching. This change requires reindexing for any content with
+ such characters. (Robert Muir, yonik)
+
Other Changes
----------------------
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java Mon Nov 23 16:09:05 2009
@@ -20,7 +20,6 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -73,13 +72,79 @@
}
char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
buffer[oldLen] = markerChar;
- //String reversed = reverseAndMark(value, markerChar);
- ReverseStringFilter.reverse(buffer, oldLen + 1);
+ reverse(buffer, 0, oldLen + 1);
posAtt.setPositionIncrement(origOffset);
termAtt.setTermBuffer(buffer, 0, oldLen +1);
return true;
}
-
+
+ /**
+ * Reverses, in place, the {@code len} chars of the buffer that begin at offset
+ * {@code start}, keeping each surrogate pair in its original (high, low) order.
+ * @param buffer the input char array to reverse; it is modified in place
+ * @param start the offset of the first char of the region to reverse
+ * @param len the number of chars in the region to reverse; a region of
+ * fewer than 2 chars is left untouched
+ */
+ public static void reverse(final char[] buffer, final int start, final int len) {
+ /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
+ if (len < 2)
+ return;
+ int end = (start + len) - 1;
+ char frontHigh = buffer[start];
+ char endLow = buffer[end];
+ boolean allowFrontSur = true, allowEndSur = true;
+ final int mid = start + (len >> 1);
+ for (int i = start; i < mid; ++i, --end) {
+ final char frontLow = buffer[i + 1];
+ final char endHigh = buffer[end - 1];
+ final boolean surAtFront = allowFrontSur
+ && Character.isSurrogatePair(frontHigh, frontLow);
+ if (surAtFront && (len < 3)) {
+ // len == 2 and the two chars form one surrogate pair: already in order
+ return;
+ }
+ final boolean surAtEnd = allowEndSur
+ && Character.isSurrogatePair(endHigh, endLow);
+ allowFrontSur = allowEndSur = true;
+ if (surAtFront == surAtEnd) {
+ if (surAtFront) {
+ // surrogate pairs at both ends: swap them as two-char units
+ buffer[end] = frontLow;
+ buffer[--end] = frontHigh;
+ buffer[i] = endHigh;
+ buffer[++i] = endLow;
+ frontHigh = buffer[i + 1];
+ endLow = buffer[end - 1];
+ } else {
+ // no surrogate pair at either end: plain single-char swap
+ buffer[end] = frontHigh;
+ buffer[i] = endLow;
+ frontHigh = frontLow;
+ endLow = endHigh;
+ }
+ } else {
+ if (surAtFront) {
+ // pair only at the front: place its low char now, high char stays pending in frontHigh
+ buffer[end] = frontLow;
+ buffer[i] = endLow;
+ endLow = endHigh;
+ allowFrontSur = false;
+ } else {
+ // pair only at the end: place its high char now, low char stays pending in endLow
+ buffer[end] = frontHigh;
+ buffer[i] = endHigh;
+ frontHigh = frontLow;
+ allowEndSur = false;
+ }
+ }
+ }
+ if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
+ // odd length with a split pair still pending: write its remaining char
+ buffer[end] = allowFrontSur ? endLow : frontHigh;
+ }
+ }
+
}
Modified: lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/search/SolrQueryParser.java Mon Nov 23 16:09:05 2009
@@ -27,10 +27,7 @@
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
-import org.apache.solr.analysis.ReversedWildcardFilter;
-import org.apache.solr.analysis.ReversedWildcardFilterFactory;
-import org.apache.solr.analysis.TokenFilterFactory;
-import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
@@ -193,7 +190,12 @@
String type = schema.getFieldType(field).getTypeName();
ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
if (factory != null && factory.shouldReverse(termStr)) {
- termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
+ int len = termStr.length();
+ char[] chars = new char[len+1];
+ chars[0] = factory.getMarkerChar();
+ termStr.getChars(0, len, chars, 1);
+ ReversedWildcardFilter.reverse(chars, 1, len);
+ termStr = new String(chars);
}
Query q = super.getWildcardQuery(field, termStr);
if (q instanceof WildcardQuery) {
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java?rev=883386&r1=883385&r2=883386&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java Mon Nov 23 16:09:05 2009
@@ -77,8 +77,8 @@
public void testIndexingAnalysis() throws Exception {
Analyzer a = schema.getAnalyzer();
- String text = "one two three";
- String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
+ String text = "one two three si\uD834\uDD1Ex";
+ String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens1 = getTokens(
new WhitespaceTokenizer(new StringReader(expected1)));
// set positionIncrements and offsets in expected tokens
@@ -86,10 +86,10 @@
Token t = expectedTokens1.get(i);
t.setPositionIncrement(0);
}
- String expected2 = "\u0001eno \u0001owt \u0001eerht";
+ String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens2 = getTokens(
new WhitespaceTokenizer(new StringReader(expected2)));
- String expected3 = "one two three";
+ String expected3 = "one two three si\uD834\uDD1Ex";
List<Token> expectedTokens3 = getTokens(
new WhitespaceTokenizer(new StringReader(expected3)));
// field one
@@ -116,10 +116,10 @@
// XXX note: this should be false, but for now we return true for any field,
// XXX if at least one field uses the reversing
assertTrue(parserThree.getAllowLeadingWildcard());
- String text = "one +two *hree f*ur fiv*";
- String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
- String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
- String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
+ String text = "one +two *hree f*ur fiv* *si\uD834\uDD1Ex";
+ String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv* one:\u0001x\uD834\uDD1Eis*";
+ String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv* two:\u0001x\uD834\uDD1Eis*";
+ String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv* three:*si\uD834\uDD1Ex";
Query q = parserOne.parse(text);
assertEquals(expectedOne, q.toString());
q = parserTwo.parse(text);