You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/28 06:15:52 UTC
svn commit: r1075210 [3/3] - in /lucene/dev/trunk: lucene/
lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/
lucene/contrib/queries/src/java/org/apache/lucene/search/
lucene/contrib/queries/src/test/org/apache/lucene/search/ lucen...
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java?rev=1075210&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/collation/tokenattributes/CollatedTermAttributeImpl.java Mon Feb 28 05:15:50 2011
@@ -0,0 +1,48 @@
+package org.apache.lucene.collation.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.Collator;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Extension of {@link CharTermAttributeImpl} that encodes the term
+ * text as a binary Unicode collation key instead of as UTF-8 bytes.
+ */
+public class CollatedTermAttributeImpl extends CharTermAttributeImpl {
+ private final Collator collator;
+
+ /**
+ * Create a new CollatedTermAttributeImpl
+ * @param collator Collation key generator
+ */
+ public CollatedTermAttributeImpl(Collator collator) {
+ this.collator = collator;
+ }
+
+ @Override
+ public int toBytesRef(BytesRef target) {
+ target.bytes = collator.getCollationKey(toString()).toByteArray();
+ target.offset = 0;
+ target.length = target.bytes.length;
+ return target.hashCode();
+ }
+
+}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Mon Feb 28 05:15:50 2011
@@ -36,6 +36,7 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
@@ -56,7 +57,9 @@ public abstract class CollationTestBase
* @param keyBits the result from
* collator.getCollationKey(original).toByteArray()
* @return The encoded collation key for the original String
+ * @deprecated only for testing deprecated filters
*/
+ @Deprecated
protected String encodeCollationKey(byte[] keyBits) {
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@@ -65,10 +68,10 @@ public abstract class CollationTestBase
IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
return new String(encodedBegArray);
}
-
- public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+
+ public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -98,9 +101,9 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+ public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -126,8 +129,8 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg, String secondEnd) throws Exception {
+ public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Mon Feb 28 05:15:50 2011
@@ -19,6 +19,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
@@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer ex
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
- private Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+ private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+ private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+ private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
@@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer ex
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java Mon Feb 28 05:15:50 2011
@@ -21,12 +21,16 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
-
+/**
+ * @deprecated remove when CollationKeyFilter is removed.
+ */
+@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of à versus U depends on the version of the rules being used
// for the inherited root locale: Ã's order isnt specified in Locale.US since
@@ -39,14 +43,14 @@ public class TestCollationKeyFilter exte
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Added: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java?rev=1075210&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationAttributeFactory.java Mon Feb 28 05:15:50 2011
@@ -0,0 +1,96 @@
+package org.apache.lucene.collation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.collation.tokenattributes.ICUCollatedTermAttributeImpl;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+import com.ibm.icu.text.Collator;
+
+/**
+ * <p>
+ * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
+ * then encodes bytes as an index term.
+ * </p>
+ * <p>
+ * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
+ * index and query time -- CollationKeys are only comparable when produced by
+ * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
+ * independently versioned, so it is safe to search against stored
+ * CollationKeys if the following are exactly the same (best practice is
+ * to store this information with the index and check that they remain the
+ * same at query time):
+ * </p>
+ * <ol>
+ * <li>
+ * Collator version - see {@link Collator#getVersion()}
+ * </li>
+ * <li>
+ * The collation strength used - see {@link Collator#setStrength(int)}
+ * </li>
+ * </ol>
+ * <p>
+ * CollationKeys generated by ICU Collators are not compatible with those
+ * generated by java.text.Collators. Specifically, if you use
+ * ICUCollationAttributeFactory to generate index terms, do not use
+ * {@link CollationAttributeFactory} on the query side, or vice versa.
+ * </p>
+ * <p>
+ * ICUCollationAttributeFactory is significantly faster and generates significantly
+ * shorter keys than CollationAttributeFactory. See
+ * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+ * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+ * generation timing and key length comparisons between ICU4J and
+ * java.text.Collator over several languages.
+ * </p>
+ */
+public class ICUCollationAttributeFactory extends AttributeSource.AttributeFactory {
+ private final Collator collator;
+ private final AttributeSource.AttributeFactory delegate;
+
+ /**
+ * Create an ICUCollationAttributeFactory, using
+ * {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY} as the
+ * factory for all other attributes.
+ * @param collator CollationKey generator
+ */
+ public ICUCollationAttributeFactory(Collator collator) {
+ this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator);
+ }
+
+ /**
+ * Create an ICUCollationAttributeFactory, using the supplied Attribute
+ * Factory as the factory for all other attributes.
+ * @param delegate Attribute Factory
+ * @param collator CollationKey generator
+ */
+ public ICUCollationAttributeFactory(AttributeSource.AttributeFactory delegate, Collator collator) {
+ this.delegate = delegate;
+ this.collator = collator;
+ }
+
+ @Override
+ public AttributeImpl createAttributeInstance(
+ Class<? extends Attribute> attClass) {
+ return attClass.isAssignableFrom(ICUCollatedTermAttributeImpl.class)
+ ? new ICUCollatedTermAttributeImpl(collator)
+ : delegate.createAttributeInstance(attClass);
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java Mon Feb 28 05:15:50 2011
@@ -19,24 +19,20 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
+import org.apache.lucene.util.Version;
import java.io.Reader;
-import java.io.IOException;
-
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
- * then encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
+ * then encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
* be stored as an index term.
* </p>
* <p>
@@ -70,39 +66,48 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ICUCollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class ICUCollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public ICUCollationKeyAnalyzer(Collator collator) {
+public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final ICUCollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new ICUCollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
+ this.factory = new ICUCollationAttributeFactory(collator);
}
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new ICUCollationKeyFilter(result, collator);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public ICUCollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new ICUCollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java Mon Feb 28 05:15:50 2011
@@ -68,7 +68,10 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
Added: lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java?rev=1075210&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java (added)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/org/apache/lucene/collation/tokenattributes/ICUCollatedTermAttributeImpl.java Mon Feb 28 05:15:50 2011
@@ -0,0 +1,50 @@
+package org.apache.lucene.collation.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RawCollationKey;
+
+/**
+ * Extension of {@link CharTermAttributeImpl} that encodes the term
+ * text as a binary Unicode collation key instead of as UTF-8 bytes.
+ */
+public class ICUCollatedTermAttributeImpl extends CharTermAttributeImpl {
+ private final Collator collator;
+ private final RawCollationKey key = new RawCollationKey();
+
+ /**
+ * Create a new ICUCollatedTermAttributeImpl
+ * @param collator Collation key generator
+ */
+ public ICUCollatedTermAttributeImpl(Collator collator) {
+ this.collator = collator;
+ }
+
+ @Override
+ public int toBytesRef(BytesRef target) {
+ collator.getRawCollationKey(toString(), key);
+ target.bytes = key.bytes;
+ target.offset = 0;
+ target.length = key.size;
+ return target.hashCode();
+ }
+}
Modified: lucene/dev/trunk/modules/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/java/overview.html?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/java/overview.html Mon Feb 28 05:15:50 2011
@@ -112,11 +112,10 @@ algorithm.
<h3>Farsi Range Queries</h3>
<code><pre>
- Collator collator = Collator.getInstance(new Locale("ar"));
- ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ Collator collator = Collator.getInstance(new ULocale("ar"));
+ ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -124,12 +123,9 @@ algorithm.
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -143,10 +139,9 @@ algorithm.
<h3>Danish Sorting</h3>
<code><pre>
Analyzer analyzer
- = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -157,7 +152,7 @@ algorithm.
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -170,18 +165,17 @@ algorithm.
<h3>Turkish Case Normalization</h3>
<code><pre>
- Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+ Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
Modified: lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Mon Feb 28 05:15:50 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.util.Locale;
@@ -27,17 +29,23 @@ import java.util.Locale;
public class TestICUCollationKeyAnalyzer extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
- private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
+ private BytesRef firstRangeBeginning = new BytesRef
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
+ private BytesRef firstRangeEnd = new BytesRef
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
+ private BytesRef secondRangeBeginning = new BytesRef
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
+ private BytesRef secondRangeEnd = new BytesRef
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
@@ -62,13 +70,13 @@ public class TestICUCollationKeyAnalyzer
//
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.US));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.FRANCE));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("sv", "se")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("da", "dk")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and java.text.Collator implementations differ in their
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
Modified: lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java Mon Feb 28 05:15:50 2011
@@ -22,24 +22,26 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.io.Reader;
import java.util.Locale;
-
+/** @deprecated remove this when ICUCollationKeyFilter is removed */
+@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/ShardDoc.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/ShardDoc.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/ShardDoc.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/ShardDoc.java Mon Feb 28 05:15:50 2011
@@ -95,10 +95,10 @@ class ShardFieldSortedHitQueue extends P
String fieldname = fields[i].getField();
comparators[i] = getCachedComparator(fieldname, fields[i]
- .getType(), fields[i].getLocale(), fields[i].getComparatorSource());
+ .getType(), fields[i].getComparatorSource());
if (fields[i].getType() == SortField.STRING) {
- this.fields[i] = new SortField(fieldname, fields[i].getLocale(),
+ this.fields[i] = new SortField(fieldname, SortField.STRING,
fields[i].getReverse());
} else {
this.fields[i] = new SortField(fieldname, fields[i].getType(),
@@ -145,17 +145,14 @@ class ShardFieldSortedHitQueue extends P
return c < 0;
}
- Comparator getCachedComparator(String fieldname, int type, Locale locale, FieldComparatorSource factory) {
+ Comparator getCachedComparator(String fieldname, int type, FieldComparatorSource factory) {
Comparator comparator = null;
switch (type) {
case SortField.SCORE:
comparator = comparatorScore(fieldname);
break;
case SortField.STRING:
- if (locale != null)
- comparator = comparatorStringLocale(fieldname, locale);
- else
- comparator = comparatorNatural(fieldname);
+ comparator = comparatorNatural(fieldname);
break;
case SortField.CUSTOM:
if (factory instanceof MissingStringLastComparatorSource){
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/schema/DateField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/schema/DateField.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/schema/DateField.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/schema/DateField.java Mon Feb 28 05:15:50 2011
@@ -410,7 +410,7 @@ public class DateField extends FieldType
/** DateField specific range query */
public Query getRangeQuery(QParser parser, SchemaField sf, Date part1, Date part2, boolean minInclusive, boolean maxInclusive) {
- return new TermRangeQuery(
+ return TermRangeQuery.newStringRange(
sf.getName(),
part1 == null ? null : toInternal(part1),
part2 == null ? null : toInternal(part2),
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/schema/FieldType.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/schema/FieldType.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/schema/FieldType.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/schema/FieldType.java Mon Feb 28 05:15:50 2011
@@ -521,7 +521,7 @@ public abstract class FieldType extends
*/
public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
// constant score mode is now enabled per default
- return new TermRangeQuery(
+ return TermRangeQuery.newStringRange(
field.getName(),
part1 == null ? null : toInternal(part1),
part2 == null ? null : toInternal(part2),
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/search/QueryParsing.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/search/QueryParsing.java?rev=1075210&r1=1075209&r2=1075210&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/search/QueryParsing.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/search/QueryParsing.java Mon Feb 28 05:15:50 2011
@@ -398,8 +398,8 @@ public class QueryParsing {
String fname = q.getField();
FieldType ft = writeFieldName(fname, schema, out, flags);
out.append(q.includesLower() ? '[' : '{');
- String lt = q.getLowerTerm();
- String ut = q.getUpperTerm();
+ String lt = q.getLowerTerm().utf8ToString();
+ String ut = q.getUpperTerm().utf8ToString();
if (lt == null) {
out.append('*');
} else {