You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/28 12:03:27 UTC

svn commit: r1207070 - in /lucene/dev/trunk/solr: ./ contrib/analysis-extras/ contrib/analysis-extras/src/java/org/apache/solr/analysis/ contrib/analysis-extras/src/java/org/apache/solr/schema/ contrib/analysis-extras/src/test-files/analysis-extras/sol...

Author: rmuir
Date: Mon Nov 28 11:03:24 2011
New Revision: 1207070

URL: http://svn.apache.org/viewvc?rev=1207070&view=rev
Log:
SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators

Added:
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml   (with props)
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml   (with props)
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java   (with props)
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java   (with props)
    lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml   (with props)
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java   (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
    lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
    lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
    lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Nov 28 11:03:24 2011
@@ -395,6 +395,8 @@ New Features
   "multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
   specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
 
+* SOLR-2919: Added support for localized range queries when the analysis chain uses 
+  CollationKeyFilter or ICUCollationKeyFilter.  (Michael Sokolov, rmuir)
 
 Bug Fixes
 ----------------------

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt Mon Nov 28 11:03:24 2011
@@ -17,6 +17,13 @@ $Id$
   the Solr 3.x ICUCollationKeyFilterFactory, and also supports
   Locale-sensitive range queries.  (rmuir)
 
+==================  3.6.0 ==================
+
+* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
+  These can be used to customize range query/sort behavior, for example to
+  support numeric collation, ignore punctuation/whitespace, ignore accents but
+  not case, control whether upper/lowercase values are sorted first, etc.  (rmuir)
+
 ==================  3.5.0 ==================
 
 (No Changes)

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -53,6 +53,15 @@ import com.ibm.icu.util.ULocale;
  *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
  *  <li>decomposition: 'no' or 'canonical' (optional)
  * </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
  *
  * @see Collator
  * @see ULocale
@@ -60,7 +69,7 @@ import com.ibm.icu.util.ULocale;
  * @deprecated use {@link org.apache.solr.schema.ICUCollationField} instead.
  */
 @Deprecated
-public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
   private Collator collator;
 
   public void inform(ResourceLoader loader) {
@@ -68,6 +77,12 @@ public class ICUCollationKeyFilterFactor
     String localeID = args.get("locale");
     String strength = args.get("strength");
     String decomposition = args.get("decomposition");
+
+    String alternate = args.get("alternate");
+    String caseLevel = args.get("caseLevel");
+    String caseFirst = args.get("caseFirst");
+    String numeric = args.get("numeric");
+    String variableTop = args.get("variableTop");
     
     if (custom == null && localeID == null)
       throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@@ -110,6 +125,36 @@ public class ICUCollationKeyFilterFactor
       else
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
     }
+    
+    // expert options: concrete subclasses are always a RuleBasedCollator
+    RuleBasedCollator rbc = (RuleBasedCollator) collator;
+    if (alternate != null) {
+      if (alternate.equalsIgnoreCase("shifted")) {
+        rbc.setAlternateHandlingShifted(true);
+      } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+        rbc.setAlternateHandlingShifted(false);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+      }
+    }
+    if (caseLevel != null) {
+      rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+    }
+    if (caseFirst != null) {
+      if (caseFirst.equalsIgnoreCase("lower")) {
+        rbc.setLowerCaseFirst(true);
+      } else if (caseFirst.equalsIgnoreCase("upper")) {
+        rbc.setUpperCaseFirst(true);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+      }
+    }
+    if (numeric != null) {
+      rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+    }
+    if (variableTop != null) {
+      rbc.setVariableTop(variableTop);
+    }
   }
   
   public TokenStream create(TokenStream input) {
@@ -141,4 +186,9 @@ public class ICUCollationKeyFilterFactor
       IOUtils.closeQuietly(input);
     }
   }
+  
+  @Override
+  public Object getMultiTermComponent() {
+    return this;
+  }
 }

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java Mon Nov 28 11:03:24 2011
@@ -66,7 +66,16 @@ import com.ibm.icu.util.ULocale;
  *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
  *  <li>decomposition: 'no' or 'canonical' (optional)
  * </ul>
- *
+ * <p>
+ * Expert options:
+ * <ul>
+ *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
+ * 
  * @see Collator
  * @see ULocale
  * @see RuleBasedCollator
@@ -90,6 +99,12 @@ public class ICUCollationField extends F
     String strength = args.remove("strength");
     String decomposition = args.remove("decomposition");
     
+    String alternate = args.remove("alternate");
+    String caseLevel = args.remove("caseLevel");
+    String caseFirst = args.remove("caseFirst");
+    String numeric = args.remove("numeric");
+    String variableTop = args.remove("variableTop");
+
     if (custom == null && localeID == null)
       throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
     
@@ -133,6 +148,37 @@ public class ICUCollationField extends F
       else
         throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
     }
+    
+    // expert options: concrete subclasses are always a RuleBasedCollator
+    RuleBasedCollator rbc = (RuleBasedCollator) collator;
+    if (alternate != null) {
+      if (alternate.equalsIgnoreCase("shifted")) {
+        rbc.setAlternateHandlingShifted(true);
+      } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+        rbc.setAlternateHandlingShifted(false);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+      }
+    }
+    if (caseLevel != null) {
+      rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+    }
+    if (caseFirst != null) {
+      if (caseFirst.equalsIgnoreCase("lower")) {
+        rbc.setLowerCaseFirst(true);
+      } else if (caseFirst.equalsIgnoreCase("upper")) {
+        rbc.setUpperCaseFirst(true);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+      }
+    }
+    if (numeric != null) {
+      rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+    }
+    if (variableTop != null) {
+      rbc.setVariableTop(variableTop);
+    }
+
     // we use 4.0 because it ensures we just encode the pure byte[] keys.
     analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
   }

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,61 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
+
+<schema name="test" version="1.0">
+  <types>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- basic text field -->
+    <fieldtype name="text" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldtype>
+    
+    <fieldtype name="sort_ar_t" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.ICUCollationKeyFilterFactory" locale="ar"/>
+      </analyzer>
+    </fieldtype>
+    
+    <fieldtype name="sort_de_t" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.ICUCollationKeyFilterFactory" locale="de" strength="primary"/>
+      </analyzer>
+    </fieldtype>
+  </types>
+
+  <fields>
+    <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+    <field name="text" type="text" indexed="true" stored="false"/>
+    <field name="sort_ar"       type="sort_ar_t"       indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_de"       type="sort_de_t"       indexed="true" stored="false" multiValued="false"/>
+  </fields>
+
+  <defaultSearchField>text</defaultSearchField>
+  <uniqueKey>id</uniqueKey>
+
+  <!-- copy our text to some sort fields with different orders -->
+  <copyField source="text" dest="sort_ar"/>
+  <copyField source="text" dest="sort_de"/>
+</schema>

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,69 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for ICUCollationField options -->
+
+<schema name="test" version="1.0">
+  <types>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- basic text field -->
+    <fieldtype name="text" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldtype>
+    
+    <!-- ignores punctuation and whitespace -->
+    <fieldtype name="sort_ignore_punctuation_t" class="solr.ICUCollationField" 
+               locale="en" strength="primary" alternate="shifted"/>
+    <!-- ignores only whitespace -->
+    <fieldtype name="sort_ignore_space_t" class="solr.ICUCollationField" 
+               locale="en" strength="primary" alternate="shifted" variableTop=" "/>
+    <!-- ignores only accents, but not case -->
+    <fieldtype name="sort_ignore_accents_t" class="solr.ICUCollationField"
+               locale="en" strength="primary" caseLevel="true"/>
+    <!-- sorts numerics in numeric order -->
+    <fieldtype name="sort_numerics_t" class="solr.ICUCollationField" 
+               locale="en" numeric="true"/>
+    <!-- sorts uppercase before lowercase -->
+    <fieldtype name="sort_uppercase_first_t" class="solr.ICUCollationField" 
+               locale="en" strength="tertiary" caseFirst="upper"/>
+  </types>
+
+  <fields>
+    <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+    <field name="text" type="text" indexed="true" stored="false"/>
+    <field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_ignore_space"       type="sort_ignore_space_t"       indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_ignore_accents"     type="sort_ignore_accents_t"     indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_numerics"           type="sort_numerics_t"           indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_uppercase_first"    type="sort_uppercase_first_t"    indexed="true" stored="false" multiValued="false"/>
+  </fields>
+
+  <defaultSearchField>text</defaultSearchField>
+  <uniqueKey>id</uniqueKey>
+
+  <!-- copy our text to some sort fields with different orders -->
+  <copyField source="text" dest="sort_ignore_punctuation"/>
+  <copyField source="text" dest="sort_ignore_space"/>
+  <copyField source="text" dest="sort_ignore_accents"/>
+  <copyField source="text" dest="sort_numerics"/>
+  <copyField source="text" dest="sort_uppercase_first"/>
+</schema>

Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -98,6 +98,133 @@ public class TestICUCollationKeyFilterFa
         new KeywordTokenizer(new StringReader(lowerCase)));
     assertCollatesToSame(tsUpper, tsLower);
   }
+  
+  /*
+   * Setting alternate=shifted to shift whitespace, punctuation and symbols
+   * to quaternary level 
+   */
+  public void testIgnorePunctuation() throws IOException {
+    String withPunctuation = "foo-bar";
+    String withoutPunctuation = "foo bar";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("alternate", "shifted");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withPunctuation)));
+    TokenStream tsWithoutPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withoutPunctuation)));
+    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+  }
+  
+  /*
+   * Setting alternate=shifted and variableTop to shift whitespace, but not 
+   * punctuation or symbols, to quaternary level 
+   */
+  public void testIgnoreWhitespace() throws IOException {
+    String withSpace = "foo bar";
+    String withoutSpace = "foobar";
+    String withPunctuation = "foo-bar";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("alternate", "shifted");
+    args.put("variableTop", " ");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsWithSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withSpace)));
+    TokenStream tsWithoutSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withoutSpace)));
+    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+    // now assert that punctuation still matters: foo-bar < foo bar
+    tsWithSpace = factory.create(
+        new KeywordTokenizer(new StringReader(withSpace)));
+    TokenStream tsWithPunctuation = factory.create(
+        new KeywordTokenizer(new StringReader(withPunctuation)));
+    assertCollation(tsWithPunctuation, tsWithSpace, -1);
+  }
+  
+  /*
+   * Setting numeric to encode digits with numeric value, so that
+   * foobar-9 sorts before foobar-10
+   */
+  public void testNumerics() throws IOException {
+    String nine = "foobar-9";
+    String ten = "foobar-10";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("numeric", "true");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsNine = factory.create(
+        new KeywordTokenizer(new StringReader(nine)));
+    TokenStream tsTen = factory.create(
+        new KeywordTokenizer(new StringReader(ten)));
+    assertCollation(tsNine, tsTen, -1);
+  }
+  
+  /*
+   * Setting caseLevel=true to create an additional case level between
+   * secondary and tertiary
+   */
+  public void testIgnoreAccentsButNotCase() throws IOException {
+    String withAccents = "résumé";
+    String withoutAccents = "resume";
+    String withAccentsUpperCase = "Résumé";
+    String withoutAccentsUpperCase = "Resume";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "primary");
+    args.put("caseLevel", "true");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsWithAccents = factory.create(
+        new KeywordTokenizer(new StringReader(withAccents)));
+    TokenStream tsWithoutAccents = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccents)));
+    assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+    
+    TokenStream tsWithAccentsUpperCase = factory.create(
+        new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+    TokenStream tsWithoutAccentsUpperCase = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+    assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+    
+    // now assert that case still matters: resume < Resume
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccents)));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+    assertCollation(tsLower, tsUpper, -1);
+  }
+  
+  /*
+   * Setting caseFirst=upper to cause uppercase strings to sort
+   * before lowercase ones.
+   */
+  public void testUpperCaseFirst() throws IOException {
+    String lower = "resume";
+    String upper = "Resume";
+    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("locale", "en");
+    args.put("strength", "tertiary");
+    args.put("caseFirst", "upper");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader(""));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(lower)));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(upper)));
+    assertCollation(tsUpper, tsLower, -1);
+  }
 
   /*
    * For german, you might want oe to sort and match with o umlaut.
@@ -156,15 +283,18 @@ public class TestICUCollationKeyFilterFa
     }
   }
   
-  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
-      throws IOException {
+  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
+    assertCollation(stream1, stream2, 0);
+  }
+  
+  private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
     CharTermAttribute term1 = stream1
         .addAttribute(CharTermAttribute.class);
     CharTermAttribute term2 = stream2
         .addAttribute(CharTermAttribute.class);
     assertTrue(stream1.incrementToken());
     assertTrue(stream2.incrementToken());
-    assertEquals(term1.toString(), term2.toString());
+    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
     assertFalse(stream1.incrementToken());
     assertFalse(stream2.incrementToken());
   }

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests {@link ICUCollationKeyFilterFactory} with RangeQueries
+ */
+public class TestICUCollationKeyRangeQueries extends SolrTestCaseJ4 {
+  
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-icucollate.xml","schema-icucollatefilter.xml", "analysis-extras/solr");
+    // add some docs
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+  
+  /** 
+   * Test termquery with german DIN 5007-1 primary strength.
+   * In this case, ö is equivalent to o (but not oe) 
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+       req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+              "//*[@numFound='2']",
+              "//result/doc[1]/int[@name='id'][.=4]",
+              "//result/doc[2]/int[@name='id'][.=7]"
+    );
+  }
+  
+  /** 
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this
+   * would retrieve nothing due to case and accent differences.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=4]",
+               "//result/doc[2]/int[@name='id'][.=7]"
+     );
+  }
+
+  /** 
+   * Test rangequery again with an Arabic collator.
+   * Binary order would normally order U+0633 in this range.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+               "//*[@numFound='0']"
+     );
+  }
+}

Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,117 @@
+package org.apache.solr.schema;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests expert options of {@link ICUCollationField}, against a core initialized with schema-icucollateoptions.xml.
+ */
+public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    assumeFalse("preflex format only supports UTF-8 encoded bytes", "Lucene3x".equals(Codec.getDefault().getName()));
+    initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml", "analysis-extras/solr");
+    // add some docs: separator variants (1-3), numeric suffixes (4-5), case/accent variants of "resume" (6-9)
+    assertU(adoc("id", "1", "text", "foo-bar"));
+    assertU(adoc("id", "2", "text", "foo bar"));
+    assertU(adoc("id", "3", "text", "foobar"));
+    assertU(adoc("id", "4", "text", "foobar-10"));
+    assertU(adoc("id", "5", "text", "foobar-9"));
+    assertU(adoc("id", "6", "text", "resume"));
+    assertU(adoc("id", "7", "text", "Résumé"));
+    assertU(adoc("id", "8", "text", "Resume"));
+    assertU(adoc("id", "9", "text", "résumé"));
+    assertU(commit());
+  }
+  
+  /**
+   * Setting alternate=shifted to shift whitespace, punctuation and symbols
+   * to quaternary level: "foobar" must match docs 1 (foo-bar), 2 (foo bar) and 3 (foobar).
+   */
+  public void testIgnorePunctuation() { 
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc" ),
+               "//*[@numFound='3']",
+               "//result/doc[1]/int[@name='id'][.=1]",
+               "//result/doc[2]/int[@name='id'][.=2]",
+               "//result/doc[3]/int[@name='id'][.=3]"
+     );
+  }
+  
+  /**
+   * Setting alternate=shifted and variableTop to shift whitespace, but not
+   * punctuation or symbols, to quaternary level: "foo bar" must match docs 2 and 3 only.
+   */
+  public void testIgnoreWhitespace() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=2]",
+               "//result/doc[2]/int[@name='id'][.=3]"
+     );
+  }
+  
+  /**
+   * Setting numeric to encode digits with numeric value, so that
+   * foobar-9 (doc 5) sorts before foobar-10 (doc 4)
+   */
+  public void testNumerics() {
+    assertQ("Collated sort: ",
+        req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=5]",
+               "//result/doc[2]/int[@name='id'][.=4]"
+     );
+  }
+  
+  /**
+   * Setting caseLevel=true to create an additional case level between secondary and
+   * tertiary: "resume" must match docs 6 and 9 only, "Resume" docs 7 and 8 only.
+   */
+  public void testIgnoreAccentsButNotCase() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=6]",
+               "//result/doc[2]/int[@name='id'][.=9]"
+     );
+    
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=7]",
+               "//result/doc[2]/int[@name='id'][.=8]"
+     );
+  }
+  
+  /**
+   * Setting caseFirst=upper to cause uppercase strings to sort
+   * before lowercase ones: doc 8 (Resume) must come before doc 6 (resume).
+   */
+  public void testUpperCaseFirst() {
+    assertQ("Collated sort: ",
+        req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=8]",
+               "//result/doc[2]/int[@name='id'][.=6]"
+     );
+  }
+}

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -72,7 +72,7 @@ import org.apache.solr.util.plugin.Resou
  * @deprecated use {@link org.apache.solr.schema.CollationField} instead.
  */
 @Deprecated
-public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
   private Collator collator;
 
   public void inform(ResourceLoader loader) {
@@ -171,4 +171,9 @@ public class CollationKeyFilterFactory e
       IOUtils.closeQuietly(input);
     }
   }
+  
+  @Override
+  public Object getMultiTermComponent() {
+    return this;
+  }
 }

Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,61 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
+
+<schema name="test" version="1.0">
+  <types>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+    <!-- basic text field -->
+    <fieldtype name="text" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldtype>
+    
+    <fieldtype name="sort_ar_t" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.CollationKeyFilterFactory" language="ar"/>
+      </analyzer>
+    </fieldtype>
+    
+    <fieldtype name="sort_de_t" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.CollationKeyFilterFactory" language="de" strength="primary"/>
+      </analyzer>
+    </fieldtype>
+  </types>
+
+  <fields>
+    <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+    <field name="text" type="text" indexed="true" stored="false"/>
+    <field name="sort_ar"       type="sort_ar_t"       indexed="true" stored="false" multiValued="false"/>
+    <field name="sort_de"       type="sort_de_t"       indexed="true" stored="false" multiValued="false"/>
+  </fields>
+
+  <defaultSearchField>text</defaultSearchField>
+  <uniqueKey>id</uniqueKey>
+
+  <!-- copy our text to some sort fields with different orders -->
+  <copyField source="text" dest="sort_ar"/>
+  <copyField source="text" dest="sort_de"/>
+</schema>

Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests {@link CollationKeyFilterFactory} with RangeQueries, using the fields of schema-collatefilter.xml
+ */
+public class TestCollationKeyRangeQueries extends SolrTestCaseJ4 {
+  
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-basic.xml","schema-collatefilter.xml");
+    // add some docs (doc 5 repeats doc 2 with its first U+0130 written decomposed as U+0049 U+0307)
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+  
+  /** 
+   * Test termquery with german DIN 5007-1 primary strength. In this case, ö is equivalent
+   * to o (but not oe): "tone" must match docs 4 (Töne) and 7 (Tone), not doc 10 (toene).
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+       req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+              "//*[@numFound='2']",
+              "//result/doc[1]/int[@name='id'][.=4]",
+              "//result/doc[2]/int[@name='id'][.=7]"
+    );
+  }
+  
+  /** 
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this would retrieve nothing
+   * due to case and accent differences; collated, exactly docs 4 and 7 must be returned.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+               "//*[@numFound='2']",
+               "//result/doc[1]/int[@name='id'][.=4]",
+               "//result/doc[2]/int[@name='id'][.=7]"
+     );
+  }
+
+  /** 
+   * Test rangequery again with an Arabic collator. Binary order would normally order
+   * U+0633 (the first letter of doc 1) in this range; collated, nothing must match.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+               "//*[@numFound='0']"
+     );
+  }
+}