You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/11/28 12:22:17 UTC
svn commit: r1207084 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/backwards/src/ lucene/backwards/src/test-framework/
lucene/backwards/src/test/ solr/ solr/contrib/analysis-extras/
solr/contrib/analysis-extras/src/java/org/apache/solr/analysi...
Author: rmuir
Date: Mon Nov 28 11:22:16 2011
New Revision: 1207084
URL: http://svn.apache.org/viewvc?rev=1207084&view=rev
Log:
SOLR-2919: Localized rangequery support for icu/jdk collation filters, parametric tailoring for icu collators
Added:
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
- copied, changed from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/solrconfig-icucollate.xml (props changed)
- copied unchanged from r1207078, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/solrconfig-icucollate.xml
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
- copied unchanged from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyRangeQueries.java
lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
- copied, changed from r1207070, lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml
lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/solrconfig-basic.xml (props changed)
- copied unchanged from r1207077, lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-basic.xml
lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
- copied unchanged from r1207070, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestCollationKeyRangeQueries.java
Removed:
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/empty
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test/ (props changed)
lucene/dev/branches/branch_3x/lucene/backwards/src/test-framework/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
lucene/dev/branches/branch_3x/solr/solrj/ (props changed)
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon Nov 28 11:22:16 2011
@@ -390,6 +390,8 @@ New Features
a complete analysis chain for multiterm queries.
(Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
+* SOLR-2919: Added support for localized range queries when the analysis chain uses
+ CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
Bug Fixes
----------------------
Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/CHANGES.txt Mon Nov 28 11:22:16 2011
@@ -15,6 +15,13 @@ $Id$
(No Changes)
+================== 3.6.0 ==================
+
+* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.
+ These can be used to customize range query/sort behavior, for example to
+ support numeric collation, ignore punctuation/whitespace, ignore accents but
+ not case, control whether upper/lowercase values are sorted first, etc. (rmuir)
+
================== 3.5.0 ==================
(No Changes)
Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUCollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -53,12 +53,21 @@ import com.ibm.icu.util.ULocale;
* <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
* <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ * <li>variableTop: single character or contraction. Characters that sort at or below it are treated as variable (ignorable) when alternate is 'shifted'.
+ * </ul>
*
* @see Collator
* @see ULocale
* @see RuleBasedCollator
*/
-public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class ICUCollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@@ -66,6 +75,12 @@ public class ICUCollationKeyFilterFactor
String localeID = args.get("locale");
String strength = args.get("strength");
String decomposition = args.get("decomposition");
+
+ String alternate = args.get("alternate");
+ String caseLevel = args.get("caseLevel");
+ String caseFirst = args.get("caseFirst");
+ String numeric = args.get("numeric");
+ String variableTop = args.get("variableTop");
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
@@ -108,6 +123,36 @@ public class ICUCollationKeyFilterFactor
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
}
+
+ // expert options: concrete subclasses are always a RuleBasedCollator
+ RuleBasedCollator rbc = (RuleBasedCollator) collator;
+ if (alternate != null) {
+ if (alternate.equalsIgnoreCase("shifted")) {
+ rbc.setAlternateHandlingShifted(true);
+ } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+ rbc.setAlternateHandlingShifted(false);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+ }
+ }
+ if (caseLevel != null) {
+ rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+ }
+ if (caseFirst != null) {
+ if (caseFirst.equalsIgnoreCase("lower")) {
+ rbc.setLowerCaseFirst(true);
+ } else if (caseFirst.equalsIgnoreCase("upper")) {
+ rbc.setUpperCaseFirst(true);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+ }
+ }
+ if (numeric != null) {
+ rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+ }
+ if (variableTop != null) {
+ rbc.setVariableTop(variableTop);
+ }
}
public TokenStream create(TokenStream input) {
@@ -139,4 +184,9 @@ public class ICUCollationKeyFilterFactor
IOUtils.closeQuietly(input);
}
}
+
+ //@Override
+ public Object getMultiTermComponent() {
+ return this;
+ }
}
Copied: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (from r1207070, lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml?p2=lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml&p1=lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml&r1=1207070&r2=1207084&rev=1207084&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/conf/schema-icucollatefilter.xml Mon Nov 28 11:22:16 2011
@@ -16,7 +16,7 @@
limitations under the License.
-->
-<!-- Test schema file for ICUCollationKeyFilter (deprecated: use ICUCollationField instead) -->
+<!-- Test schema file for ICUCollationKeyFilter -->
<schema name="test" version="1.0">
<types>
Modified: lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUCollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -97,6 +97,133 @@ public class TestICUCollationKeyFilterFa
new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
+
+ /*
+ * Setting alternate=shifted to shift whitespace, punctuation and symbols
+ * to quaternary level
+ */
+ public void testIgnorePunctuation() throws IOException {
+ String withPunctuation = "foo-bar";
+ String withoutPunctuation = "foo bar";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("alternate", "shifted");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withPunctuation)));
+ TokenStream tsWithoutPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withoutPunctuation)));
+ assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
+ }
+
+ /*
+ * Setting alternate=shifted and variableTop to shift whitespace, but not
+ * punctuation or symbols, to quaternary level
+ */
+ public void testIgnoreWhitespace() throws IOException {
+ String withSpace = "foo bar";
+ String withoutSpace = "foobar";
+ String withPunctuation = "foo-bar";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("alternate", "shifted");
+ args.put("variableTop", " ");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsWithSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withSpace)));
+ TokenStream tsWithoutSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withoutSpace)));
+ assertCollatesToSame(tsWithSpace, tsWithoutSpace);
+ // now assert that punctuation still matters: foo-bar < foo bar
+ tsWithSpace = factory.create(
+ new KeywordTokenizer(new StringReader(withSpace)));
+ TokenStream tsWithPunctuation = factory.create(
+ new KeywordTokenizer(new StringReader(withPunctuation)));
+ assertCollation(tsWithPunctuation, tsWithSpace, -1);
+ }
+
+ /*
+ * Setting numeric to encode digits with numeric value, so that
+ * foobar-9 sorts before foobar-10
+ */
+ public void testNumerics() throws IOException {
+ String nine = "foobar-9";
+ String ten = "foobar-10";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("numeric", "true");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsNine = factory.create(
+ new KeywordTokenizer(new StringReader(nine)));
+ TokenStream tsTen = factory.create(
+ new KeywordTokenizer(new StringReader(ten)));
+ assertCollation(tsNine, tsTen, -1);
+ }
+
+ /*
+ * Setting caseLevel=true to create an additional case level between
+ * secondary and tertiary
+ */
+ public void testIgnoreAccentsButNotCase() throws IOException {
+ String withAccents = "résumé";
+ String withoutAccents = "resume";
+ String withAccentsUpperCase = "Résumé";
+ String withoutAccentsUpperCase = "Resume";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "primary");
+ args.put("caseLevel", "true");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsWithAccents = factory.create(
+ new KeywordTokenizer(new StringReader(withAccents)));
+ TokenStream tsWithoutAccents = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ assertCollatesToSame(tsWithAccents, tsWithoutAccents);
+
+ TokenStream tsWithAccentsUpperCase = factory.create(
+ new KeywordTokenizer(new StringReader(withAccentsUpperCase)));
+ TokenStream tsWithoutAccentsUpperCase = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase);
+
+ // now assert that case still matters: resume < Resume
+ TokenStream tsLower = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccents)));
+ TokenStream tsUpper = factory.create(
+ new KeywordTokenizer(new StringReader(withoutAccentsUpperCase)));
+ assertCollation(tsLower, tsUpper, -1);
+ }
+
+ /*
+ * Setting caseFirst=upper to cause uppercase strings to sort
+ * before lowercase ones.
+ */
+ public void testUpperCaseFirst() throws IOException {
+ String lower = "resume";
+ String upper = "Resume";
+ ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("locale", "en");
+ args.put("strength", "tertiary");
+ args.put("caseFirst", "upper");
+ factory.init(args);
+ factory.inform(new StringMockSolrResourceLoader(""));
+ TokenStream tsLower = factory.create(
+ new KeywordTokenizer(new StringReader(lower)));
+ TokenStream tsUpper = factory.create(
+ new KeywordTokenizer(new StringReader(upper)));
+ assertCollation(tsUpper, tsLower, -1);
+ }
/*
* For german, you might want oe to sort and match with o umlaut.
@@ -155,15 +282,18 @@ public class TestICUCollationKeyFilterFa
}
}
- private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
- throws IOException {
+ private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
+ assertCollation(stream1, stream2, 0);
+ }
+
+ private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
CharTermAttribute term1 = stream1
.addAttribute(CharTermAttribute.class);
CharTermAttribute term2 = stream2
.addAttribute(CharTermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());
- assertEquals(term1.toString(), term2.toString());
+ assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
assertFalse(stream1.incrementToken());
assertFalse(stream2.incrementToken());
}
Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java?rev=1207084&r1=1207083&r2=1207084&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java Mon Nov 28 11:22:16 2011
@@ -70,7 +70,7 @@ import org.apache.solr.util.plugin.Resou
* @see RuleBasedCollator
* @since solr 3.1
*/
-public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements MultiTermAwareComponent,ResourceLoaderAware {
private Collator collator;
public void inform(ResourceLoader loader) {
@@ -169,4 +169,9 @@ public class CollationKeyFilterFactory e
IOUtils.closeQuietly(input);
}
}
+
+ //@Override
+ public Object getMultiTermComponent() {
+ return this;
+ }
}
Copied: lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (from r1207070, lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml?p2=lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml&p1=lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml&r1=1207070&r2=1207084&rev=1207084&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-collatefilter.xml (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test-files/solr/conf/schema-collatefilter.xml Mon Nov 28 11:22:16 2011
@@ -16,7 +16,7 @@
limitations under the License.
-->
-<!-- Test schema file for CollationKeyFilter (deprecated: use CollationField instead) -->
+<!-- Test schema file for CollationKeyFilter -->
<schema name="test" version="1.0">
<types>