You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/28 12:03:27 UTC
svn commit: r1207070 - in /lucene/dev/trunk/solr: ./ contrib/analysis-extras/
contrib/analysis-extras/src/java/org/apache/solr/analysis/
contrib/analysis-extras/src/java/org/apache/solr/schema/
contrib/analysis-extras/src/test-files/analysis-extras/sol...
Author: rmuir
Date: Mon Nov 28 11:03:24 2011
New Revision: 1207070
URL: http://svn.apache.org/viewvc?rev=1207070&view=rev
Log:
SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators
Added:
lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java (with props)
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java (with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Nov 28 11:03:24 2011
@@ -395,6 +395,8 @@ New Features
"multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
+* SOLR-2919: Added support for localized range queries when the analysis chain uses
+ CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
Bug Fixes
----------------------
Modified: lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/CHANGES.txt Mon Nov 28 11:03:24 2011
@@ -17,6 +17,13 @@ $Id$
the Solr 3.x ICUCollationKeyFilterFactory, and also supports
Locale-sensitive range queries. (rmuir)
+================== 3.6.0 ==================
+
+* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
+ These can be used to customize range query/sort behavior, for example to
+ support numeric collation, ignore punctuation/whitespace, ignore accents but
+ not case, control whether upper/lowercase values are sorted first, etc. (rmuir)
+
================== 3.5.0 ==================
(No Changes)
Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -53,6 +53,15 @@ import com.ibm.icu.util.ULocale;
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ * <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
*
* @see Collator
* @see ULocale
@@ -60,7 +69,7 @@ import com.ibm.icu.util.ULocale;
* @deprecated use {@link org.apache.solr.schema.ICUCollationField} instead.
*/
@Deprecated
-public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@@ -68,6 +77,12 @@ public class ICUCollationKeyFilterFactor
String localeID = args.get("locale");
String strength = args.get("strength");
String decomposition = args.get("decomposition");
+
+ String alternate = args.get("alternate");
+ String caseLevel = args.get("caseLevel");
+ String caseFirst = args.get("caseFirst");
+ String numeric = args.get("numeric");
+ String variableTop = args.get("variableTop");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@@ -110,6 +125,36 @@ public class ICUCollationKeyFilterFactor
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
+
+ // expert options: concrete subclasses are always a RuleBasedCollator
+ RuleBasedCollator rbc = (RuleBasedCollator) collator;
+ if (alternate != null) {
+ if (alternate.equalsIgnoreCase("shifted")) {
+ rbc.setAlternateHandlingShifted(true);
+ } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+ rbc.setAlternateHandlingShifted(false);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+ }
+ }
+ if (caseLevel != null) {
+ rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+ }
+ if (caseFirst != null) {
+ if (caseFirst.equalsIgnoreCase("lower")) {
+ rbc.setLowerCaseFirst(true);
+ } else if (caseFirst.equalsIgnoreCase("upper")) {
+ rbc.setUpperCaseFirst(true);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+ }
+ }
+ if (numeric != null) {
+ rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+ }
+ if (variableTop != null) {
+ rbc.setVariableTop(variableTop);
+ }
}
public TokenStream create(TokenStream input) {
@@ -141,4 +186,9 @@ public class ICUCollationKeyFilterFactor
IOUtils.closeQuietly(input);
}
}
+
+ @Override
+ public Object getMultiTermComponent() {
+ return this;
+ }
}
Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java Mon Nov 28 11:03:24 2011
@@ -66,7 +66,16 @@ import com.ibm.icu.util.ULocale;
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
- *
+ * <p>
+ * Expert options:
+ * <ul>
+ * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ * <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
+ *
* @see Collator
* @see ULocale
* @see RuleBasedCollator
@@ -90,6 +99,12 @@ public class ICUCollationField extends F
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
+ String alternate = args.remove("alternate");
+ String caseLevel = args.remove("caseLevel");
+ String caseFirst = args.remove("caseFirst");
+ String numeric = args.remove("numeric");
+ String variableTop = args.remove("variableTop");
+
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@@ -133,6 +148,37 @@ public class ICUCollationField extends F
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
+
+ // expert options: concrete subclasses are always a RuleBasedCollator
+ RuleBasedCollator rbc = (RuleBasedCollator) collator;
+ if (alternate != null) {
+ if (alternate.equalsIgnoreCase("shifted")) {
+ rbc.setAlternateHandlingShifted(true);
+ } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+ rbc.setAlternateHandlingShifted(false);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+ }
+ }
+ if (caseLevel != null) {
+ rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+ }
+ if (caseFirst != null) {
+ if (caseFirst.equalsIgnoreCase("lower")) {
+ rbc.setLowerCaseFirst(true);
+ } else if (caseFirst.equalsIgnoreCase("upper")) {
+ rbc.setUpperCaseFirst(true);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+ }
+ }
+ if (numeric != null) {
+ rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+ }
+ if (variableTop != null) {
+ rbc.setVariableTop(variableTop);
+ }
+
// we use 4.0 because it ensures we just encode the pure byte[] keys.
analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
}
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,61 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
+
+<schema name="test" version="1.0">
+ <types>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- basic text field -->
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="sort_ar_t" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.ICUCollationKeyFilterFactory" locale="ar"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="sort_de_t" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.ICUCollationKeyFilterFactory" locale="de" strength="primary"/>
+ </analyzer>
+ </fieldtype>
+ </types>
+
+ <fields>
+ <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ <field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
+ </fields>
+
+ <defaultSearchField>text</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+ <!-- copy our text to some sort fields with different orders -->
+ <copyField source="text" dest="sort_ar"/>
+ <copyField source="text" dest="sort_de"/>
+</schema>
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollateoptions.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,69 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for CollationField options -->
+
+<schema name="test" version="1.0">
+ <types>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- basic text field -->
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- ignores punctuation and whitespace -->
+ <fieldtype name="sort_ignore_punctuation_t" class="solr.ICUCollationField"
+ locale="en" strength="primary" alternate="shifted"/>
+ <!-- ignores only whitespace -->
+ <fieldtype name="sort_ignore_space_t" class="solr.ICUCollationField"
+ locale="en" strength="primary" alternate="shifted" variableTop=" "/>
+ <!-- ignores only accents, but not case -->
+ <fieldtype name="sort_ignore_accents_t" class="solr.ICUCollationField"
+ locale="en" strength="primary" caseLevel="true"/>
+ <!-- sorts numerics in numeric order -->
+ <fieldtype name="sort_numerics_t" class="solr.ICUCollationField"
+ locale="en" numeric="true"/>
+ <!-- sorts uppercase before lowercase -->
+ <fieldtype name="sort_uppercase_first_t" class="solr.ICUCollationField"
+ locale="en" strength="tertiary" caseFirst="upper"/>
+ </types>
+
+ <fields>
+ <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ <field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_ignore_space" type="sort_ignore_space_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_ignore_accents" type="sort_ignore_accents_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_numerics" type="sort_numerics_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_uppercase_first" type="sort_uppercase_first_t" indexed="true" stored="false" multiValued="false"/>
+ </fields>
+
+ <defaultSearchField>text</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+ <!-- copy our text to some sort fields with different orders -->
+ <copyField source="text" dest="sort_ignore_punctuation"/>
+ <copyField source="text" dest="sort_ignore_space"/>
+ <copyField source="text" dest="sort_ignore_accents"/>
+ <copyField source="text" dest="sort_numerics"/>
+ <copyField source="text" dest="sort_uppercase_first"/>
+</schema>
Modified: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -98,6 +98,133 @@ public class TestICUCollationKeyFilterFa
new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
+
+ /*
+ * Setting alternate=shifted to shift whitespace, punctuation and symbols
+ * to quaternary level
+ */
+ public void testIgnorePunctuation() throws IOException {
+ String withPunctuation = "foo-bar";
+ String withoutPunctuation = "foo bar";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("alternate", "shifted");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withPunctuation)));
+ TokenStream tsWithoutPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withoutPunctuation)));
+ assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+ }
+
+ /*
+ * Setting alternate=shifted and variableTop to shift whitespace, but not
+ * punctuation or symbols, to quaternary level
+ */
+ public void testIgnoreWhitespace() throws IOException {
+ String withSpace = "foo bar";
+ String withoutSpace = "foobar";
+ String withPunctuation = "foo-bar";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("alternate", "shifted");
+ args.put("variableTop", " ");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsWithSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withSpace)));
+ TokenStream tsWithoutSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withoutSpace)));
+ assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+ // now assert that punctuation still matters: foo-bar < foo bar
+ tsWithSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withSpace)));
+ TokenStream tsWithPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withPunctuation)));
+ assertCollation(tsWithPunctuation, tsWithSpace, -1);
+ }
+
+ /*
+ * Setting numeric to encode digits with numeric value, so that
+ * foobar-9 sorts before foobar-10
+ */
+ public void testNumerics() throws IOException {
+ String nine = "foobar-9";
+ String ten = "foobar-10";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("numeric", "true");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsNine = factory.create(
+ new KeywordTokenizer(new StringReader(nine)));
+ TokenStream tsTen = factory.create(
+ new KeywordTokenizer(new StringReader(ten)));
+ assertCollation(tsNine, tsTen, -1);
+ }
+
+ /*
+ * Setting caseLevel=true to create an additional case level between
+ * secondary and tertiary
+ */
+ public void testIgnoreAccentsButNotCase() throws IOException {
+ String withAccents = "résumé";
+ String withoutAccents = "resume";
+ String withAccentsUpperCase = "Résumé";
+ String withoutAccentsUpperCase = "Resume";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("caseLevel", "true");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsWithAccents = factory.create(
+ new KeywordTokenizer(new StringReader(withAccents)));
+ TokenStream tsWithoutAccents = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+
+ TokenStream tsWithAccentsUpperCase = factory.create(
+ new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+ TokenStream tsWithoutAccentsUpperCase = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+
+ // now assert that case still matters: resume < Resume
+ TokenStream tsLower = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ TokenStream tsUpper = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollation(tsLower, tsUpper, -1);
+ }
+
+ /*
+ * Setting caseFirst=upper to cause uppercase strings to sort
+ * before lowercase ones.
+ */
+ public void testUpperCaseFirst() throws IOException {
+ String lower = "resume";
+ String upper = "Resume";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "tertiary");
+ args.put("caseFirst", "upper");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsLower = factory.create(
+ new KeywordTokenizer(new StringReader(lower)));
+ TokenStream tsUpper = factory.create(
+ new KeywordTokenizer(new StringReader(upper)));
+ assertCollation(tsUpper, tsLower, -1);
+ }
/*
* For german, you might want oe to sort and match with o umlaut.
@@ -156,15 +283,18 @@ public class TestICUCollationKeyFilterFa
}
}
- private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
- throws IOException {
+ private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
+ assertCollation(stream1, stream2, 0);
+ }
+
+ private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());
- assertEquals(term1.toString(), term2.toString());
+ assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
assertFalse(stream1.incrementToken());
assertFalse(stream2.incrementToken());
}
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests {@link ICUCollationKeyFilterFactory} with RangeQueries
+ */
+public class TestICUCollationKeyRangeQueries extends SolrTestCaseJ4 {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-icucollate.xml","schema-icucollatefilter.xml", "analysis-extras/solr");
+ // add some docs
+ assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+ assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+ assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+ assertU(adoc("id", "4", "text", "Töne"));
+ assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+ assertU(adoc("id", "6", "text", "Ｔｅｓｔｉｎｇ"));
+ assertU(adoc("id", "7", "text", "Tone"));
+ assertU(adoc("id", "8", "text", "Testing"));
+ assertU(adoc("id", "9", "text", "testing"));
+ assertU(adoc("id", "10", "text", "toene"));
+ assertU(adoc("id", "11", "text", "Tzne"));
+ assertU(adoc("id", "12", "text", "\u0698\u0698"));
+ assertU(commit());
+ }
+
+ /**
+ * Test termquery with german DIN 5007-1 primary strength.
+ * In this case, ö is equivalent to o (but not oe)
+ */
+ public void testBasicTermQuery() {
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=4]",
+ "//result/doc[2]/int[@name='id'][.=7]"
+ );
+ }
+
+ /**
+ * Test rangequery again with the DIN 5007-1 collator.
+ * We do a range query of tone .. tp, in binary order this
+ * would retrieve nothing due to case and accent differences.
+ */
+ public void testBasicRangeQuery() {
+ assertQ("Collated RangeQ: ",
+ req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=4]",
+ "//result/doc[2]/int[@name='id'][.=7]"
+ );
+ }
+
+ /**
+ * Test rangequery again with an Arabic collator.
+ * Binary order would normally order U+0633 in this range.
+ */
+ public void testNegativeRangeQuery() {
+ assertQ("Collated RangeQ: ",
+ req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+ "//*[@numFound='0']"
+ );
+ }
+}
Added: lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java (added)
+++ lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,117 @@
+package org.apache.solr.schema;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests expert options of {@link ICUCollationField}.
+ */
+public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "Lucene3x".equals(Codec.getDefault().getName()));
+ initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml", "analysis-extras/solr");
+ // add some docs
+ assertU(adoc("id", "1", "text", "foo-bar"));
+ assertU(adoc("id", "2", "text", "foo bar"));
+ assertU(adoc("id", "3", "text", "foobar"));
+ assertU(adoc("id", "4", "text", "foobar-10"));
+ assertU(adoc("id", "5", "text", "foobar-9"));
+ assertU(adoc("id", "6", "text", "resume"));
+ assertU(adoc("id", "7", "text", "Résumé"));
+ assertU(adoc("id", "8", "text", "Resume"));
+ assertU(adoc("id", "9", "text", "résumé"));
+ assertU(commit());
+ }
+
+ /*
+ * Setting alternate=shifted to shift whitespace, punctuation and symbols
+ * to quaternary level
+ */
+ public void testIgnorePunctuation() {
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc" ),
+ "//*[@numFound='3']",
+ "//result/doc[1]/int[@name='id'][.=1]",
+ "//result/doc[2]/int[@name='id'][.=2]",
+ "//result/doc[3]/int[@name='id'][.=3]"
+ );
+ }
+
+ /*
+ * Setting alternate=shifted and variableTop to shift whitespace, but not
+ * punctuation or symbols, to quaternary level
+ */
+ public void testIgnoreWhitespace() {
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=2]",
+ "//result/doc[2]/int[@name='id'][.=3]"
+ );
+ }
+
+ /*
+ * Setting numeric to encode digits with numeric value, so that
+ * foobar-9 sorts before foobar-10
+ */
+ public void testNumerics() {
+ assertQ("Collated sort: ",
+ req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=5]",
+ "//result/doc[2]/int[@name='id'][.=4]"
+ );
+ }
+
+ /*
+ * Setting caseLevel=true to create an additional case level between
+ * secondary and tertiary
+ */
+ public void testIgnoreAccentsButNotCase() {
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=6]",
+ "//result/doc[2]/int[@name='id'][.=9]"
+ );
+
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=7]",
+ "//result/doc[2]/int[@name='id'][.=8]"
+ );
+ }
+
+ /*
+ * Setting caseFirst=upper to cause uppercase strings to sort
+ * before lowercase ones.
+ */
+ public void testUpperCaseFirst() {
+ assertQ("Collated sort: ",
+ req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=8]",
+ "//result/doc[2]/int[@name='id'][.=6]"
+ );
+ }
+}
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java?rev=1207070&r1=1207069&r2=1207070&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java Mon Nov 28 11:03:24 2011
@@ -72,7 +72,7 @@ import org.apache.solr.util.plugin.Resou
* @deprecated use {@link org.apache.solr.schema.CollationField} instead.
*/
@Deprecated
-public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@@ -171,4 +171,9 @@ public class CollationKeyFilterFactory e
IOUtils.closeQuietly(input);
}
}
+
+ @Override
+ public Object getMultiTermComponent() {
+ return this; // the factory itself is the multi-term component (presumably so range-query bounds get collation keys too)
+ }
}
Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml Mon Nov 28 11:03:24 2011
@@ -0,0 +1,61 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
+
+<schema name="test" version="1.0">
+ <types>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- basic text field -->
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="sort_ar_t" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.CollationKeyFilterFactory" language="ar"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="sort_de_t" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.CollationKeyFilterFactory" language="de" strength="primary"/>
+ </analyzer>
+ </fieldtype>
+ </types>
+
+ <fields>
+ <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ <field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
+ <field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
+ </fields>
+
+ <defaultSearchField>text</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+ <!-- copy our text into the sort fields, each of which uses a different collation order -->
+ <copyField source="text" dest="sort_ar"/>
+ <copyField source="text" dest="sort_de"/>
+</schema>
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java?rev=1207070&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java Mon Nov 28 11:03:24 2011
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests {@link CollationKeyFilterFactory} with RangeQueries
+ */
+public class TestCollationKeyRangeQueries extends SolrTestCaseJ4 {
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-basic.xml","schema-collatefilter.xml");
+ // add some docs; non-ASCII text uses Unicode escapes so it cannot be garbled by re-encoding
+ assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+ assertU(adoc("id", "2", "text", "I W\u0130LL USE TURK\u0130SH CASING")); // U+0130: dotted capital I
+ assertU(adoc("id", "3", "text", "\u0131 will use turkish cas\u0131ng")); // U+0131: dotless small i
+ assertU(adoc("id", "4", "text", "T\u00F6ne")); // U+00F6: o-umlaut
+ assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURK\u0130SH CASING")); // first dotted I is decomposed
+ assertU(adoc("id", "6", "text", "\uFF34\uFF45\uFF53\uFF54\uFF49\uFF4E\uFF47")); // fullwidth "Testing"
+ assertU(adoc("id", "7", "text", "Tone"));
+ assertU(adoc("id", "8", "text", "Testing"));
+ assertU(adoc("id", "9", "text", "testing"));
+ assertU(adoc("id", "10", "text", "toene"));
+ assertU(adoc("id", "11", "text", "Tzne"));
+ assertU(adoc("id", "12", "text", "\u0698\u0698"));
+ assertU(commit());
+ }
+
+ /**
+ * Test termquery with german DIN 5007-1 primary strength.
+ * In this case, ö (o-umlaut) is equivalent to o (but not oe), so "tone" matches docs 4 and 7 only.
+ */
+ public void testBasicTermQuery() {
+ assertQ("Collated TQ: ",
+ req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=4]",
+ "//result/doc[2]/int[@name='id'][.=7]"
+ );
+ }
+
+ /**
+ * Test rangequery again with the DIN 5007-1 collator.
+ * The range tone .. tp must return docs 4 and 7; in binary order this
+ * would retrieve nothing due to case and accent differences.
+ */
+ public void testBasicRangeQuery() {
+ assertQ("Collated RangeQ: ",
+ req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+ "//*[@numFound='2']",
+ "//result/doc[1]/int[@name='id'][.=4]",
+ "//result/doc[2]/int[@name='id'][.=7]"
+ );
+ }
+
+ /**
+ * Test rangequery again with an Arabic collator, expecting no hits.
+ * Binary order would normally order U+0633 (the start of doc 1's text) in this range.
+ */
+ public void testNegativeRangeQuery() {
+ assertQ("Collated RangeQ: ",
+ req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+ "//*[@numFound='0']"
+ );
+ }
+}