You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/28 12:22:17 UTC

svn commit: r1207084 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/src/ lucene/backwards/src/test-framework/ lucene/backwards/src/test/ solr/ solr/contrib/analysis-extras/ solr/contrib/analysis-extras/src/java/org/apache/solr/analysi...

Author: rmuir
Date: Mon Nov 28 11:22:16 2011
New Revision: 1207084

URL: http://svn.apache.org/viewvc?rev=1207084&view=rev
Log:
SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators

Added:
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
      - copied, changed from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/solrconfig-icucollate.xml   (props changed)
      - copied unchanged from r1207078, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/solrconfig-icucollate.xml
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
      - copied unchanged from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
    lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
      - copied, changed from r1207070, lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
    lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/solrconfig-basic.xml   (props changed)
      - copied unchanged from r1207077, lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-basic.xml
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
      - copied unchanged from r1207070, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
Removed:
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/empty
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/test/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/test-framework/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
    lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
    lucene/dev/branches/branch_3x/solr/solrj/   (props changed)

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon Nov 28 11:22:16 2011
@@ -390,6 +390,8 @@ New Features
   a complete analysis chain for multiterm queries.   
   (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
 
+* SOLR-2919: Added support for localized range queries when the analysis chain uses 
+  CollationKeyFilter or ICUCollationKeyFilter.  (Michael Sokolov, rmuir)
 
 Bug Fixes
 ----------------------

Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt Mon Nov 28 11:22:16 2011
@@ -15,6 +15,13 @@ $Id$
 
 (No Changes)
 
+==================  3.6.0 ==================
+
+* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
+  These can be used to customize range query/sort behavior, for example to
+  support numeric collation, ignore punctuation/whitespace, ignore accents but
+  not case, control whether upper/lowercase values are sorted first, etc.  (rmuir)
+
 ==================  3.5.0 ==================
 
 (No Changes)

Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -53,12 +53,21 @@ import com.ibm.icu.util.ULocale;
  *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
  *  <li>decomposition: 'no' or 'canonical' (optional)
  * </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
  *
  * @see Collator
  * @see ULocale
  * @see RuleBasedCollator
  */
-public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
   private Collator collator;
 
   public void inform(ResourceLoader loader) {
@@ -66,6 +75,12 @@ public class ICUCollationKeyFilterFactor
     String localeID = args.get("locale");
     String strength = args.get("strength");
     String decomposition = args.get("decomposition");
+
+    String alternate = args.get("alternate");
+    String caseLevel = args.get("caseLevel");
+    String caseFirst = args.get("caseFirst");
+    String numeric = args.get("numeric");
+    String variableTop = args.get("variableTop");
     
     if (custom == null && localeID == null)
       throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@@ -108,6 +123,36 @@ public class ICUCollationKeyFilterFactor
       else
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
     }
+    
+    // expert options: concrete subclasses are always a RuleBasedCollator
+    RuleBasedCollator rbc = (RuleBasedCollator) collator;
+    if (alternate != null) {
+      if (alternate.equalsIgnoreCase("shifted")) {
+        rbc.setAlternateHandlingShifted(true);
+      } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+        rbc.setAlternateHandlingShifted(false);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+      }
+    }
+    if (caseLevel != null) {
+      rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+    }
+    if (caseFirst != null) {
+      if (caseFirst.equalsIgnoreCase("lower")) {
+        rbc.setLowerCaseFirst(true);
+      } else if (caseFirst.equalsIgnoreCase("upper")) {
+        rbc.setUpperCaseFirst(true);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+      }
+    }
+    if (numeric != null) {
+      rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+    }
+    if (variableTop != null) {
+      rbc.setVariableTop(variableTop);
+    }
   }
   
   public TokenStream create(TokenStream input) {
@@ -139,4 +184,9 @@ public class ICUCollationKeyFilterFactor
       IOUtils.closeQuietly(input);
     }
   }
+  
+  //@Override
+  public Object getMultiTermComponent() {
+    return this;
+  }
 }

Copied: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml?p2=lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml&p1=lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml&r1=1207070&r2=1207084&rev=1207084&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml Mon Nov 28 11:22:16 2011
@@ -16,7 +16,7 @@
  limitations under the License.
 -->
 
-<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
+<!-- Test schema file for ICUCollationKeyFilter -->
 
 <schema name="test" version="1.0">
   <types>

Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -97,6 +97,133 @@ public class TestICUCollationKeyFilterFa
         new KeywordTokenizer(new StringReader(lowerCase)));
     assertCollatesToSame(tsUpper, tsLower);
   }
+  
+  /*
+   * Setting alternate=shifted to shift whitespace, punctuation and symbols
+   * to quaternary level 
+   */
+  public void testIgnorePunctuation() throws IOException {
+    String withPunctuation = "foo-bar";
+    String withoutPunctuation = "foo bar";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("alternate", "shifted");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withPunctuation)));
+    TokenStream tsWithoutPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withoutPunctuation)));
+    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+  }
+  
+  /*
+   * Setting alternate=shifted and variableTop to shift whitespace, but not 
+   * punctuation or symbols, to quaternary level 
+   */
+  public void testIgnoreWhitespace() throws IOException {
+    String withSpace = "foo bar";
+    String withoutSpace = "foobar";
+    String withPunctuation = "foo-bar";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("alternate", "shifted");
+    args.put("variableTop", " ");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsWithSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withSpace)));
+    TokenStream tsWithoutSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withoutSpace)));
+    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+    // now assert that punctuation still matters: foo-bar < foo bar
+    tsWithSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withSpace)));
+    TokenStream tsWithPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withPunctuation)));
+    assertCollation(tsWithPunctuation, tsWithSpace, -1);
+  }
+  
+  /*
+   * Setting numeric to encode digits with numeric value, so that
+   * foobar-9 sorts before foobar-10
+   */
+  public void testNumerics() throws IOException {
+    String nine = "foobar-9";
+    String ten = "foobar-10";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("numeric", "true");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsNine = factory.create(
+        new KeywordTokenizer(new StringReader(nine)));
+    TokenStream tsTen = factory.create(
+        new KeywordTokenizer(new StringReader(ten)));
+    assertCollation(tsNine, tsTen, -1);
+  }
+  
+  /*
+   * Setting caseLevel=true to create an additional case level between
+   * secondary and tertiary
+   */
+  public void testIgnoreAccentsButNotCase() throws IOException {
+    String withAccents = "résumé";
+    String withoutAccents = "resume";
+    String withAccentsUpperCase = "Résumé";
+    String withoutAccentsUpperCase = "Resume";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("caseLevel", "true");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsWithAccents = factory.create(
+        new KeywordTokenizer(new StringReader(withAccents)));
+    TokenStream tsWithoutAccents = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccents)));
+    assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+    
+    TokenStream tsWithAccentsUpperCase = factory.create(
+        new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+    TokenStream tsWithoutAccentsUpperCase = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+    assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+    
+    // now assert that case still matters: resume < Resume
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccents)));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+    assertCollation(tsLower, tsUpper, -1);
+  }
+  
+  /*
+   * Setting caseFirst=upper to cause uppercase strings to sort
+   * before lowercase ones.
+   */
+  public void testUpperCaseFirst() throws IOException {
+    String lower = "resume";
+    String upper = "Resume";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "tertiary");
+    args.put("caseFirst", "upper");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(lower)));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(upper)));
+    assertCollation(tsUpper, tsLower, -1);
+  }
 
   /*
    * For german, you might want oe to sort and match with o umlaut.
@@ -155,15 +282,18 @@ public class TestICUCollationKeyFilterFa
     }
   }
   
-  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
-      throws IOException {
+  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
+    assertCollation(stream1, stream2, 0);
+  }
+  
+  private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
     CharTermAttribute term1 = stream1
         .addAttribute(CharTermAttribute.class);
     CharTermAttribute term2 = stream2
         .addAttribute(CharTermAttribute.class);
     assertTrue(stream1.incrementToken());
     assertTrue(stream2.incrementToken());
-    assertEquals(term1.toString(), term2.toString());
+    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
     assertFalse(stream1.incrementToken());
     assertFalse(stream2.incrementToken());
   }

Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -70,7 +70,7 @@ import org.apache.solr.util.plugin.Resou
  * @see RuleBasedCollator
  * @since solr 3.1
  */
-public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
   private Collator collator;
 
   public void inform(ResourceLoader loader) {
@@ -169,4 +169,9 @@ public class CollationKeyFilterFactory e
       IOUtils.closeQuietly(input);
     }
   }
+  
+  //@Override
+  public Object getMultiTermComponent() {
+    return this;
+  }
 }

Copied: lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (from r1207070, lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml?p2=lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml&p1=lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml&r1=1207070&r2=1207084&rev=1207084&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml Mon Nov 28 11:22:16 2011
@@ -16,7 +16,7 @@
  limitations under the License.
 -->
 
-<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
+<!-- Test schema file for CollationKeyFilter -->
 
 <schema name="test" version="1.0">
   <types>