You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/27 22:34:15 UTC
svn commit: r885024 - in /lucene/java/trunk: ./
contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/
contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/ contrib/a...
Author: rmuir
Date: Fri Nov 27 21:34:11 2009
New Revision: 885024
URL: http://svn.apache.org/viewvc?rev=885024&view=rev
Log:
LUCENE-2069: supplementary char support for lowercasefilter
Added:
lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java
lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java
lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java
lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 27 21:34:11 2009
@@ -25,6 +25,12 @@
New features
+* LUCENE-2069: Added Unicode 4 support to LowerCaseFilter. Due to the switch
+ to Java 5, supplementary characters are now lowercased correctly.
+ LowerCaseFilter now requires a Version argument to preserve
+ backwards compatibility. If Version < 3.1 is passed to the constructor,
+ LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)
+
Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -168,7 +168,7 @@
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer( reader );
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
@@ -198,7 +198,7 @@
if (streams == null) {
streams = new SavedStreams();
streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new LowerCaseFilter(matchVersion, streams.source);
// the order here is important: the stopword list is not normalized!
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -199,7 +199,7 @@
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer( matchVersion, reader );
- result = new LowerCaseFilter( result );
+ result = new LowerCaseFilter( matchVersion, result );
result = new StandardFilter( result );
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
@@ -227,7 +227,7 @@
if (streams == null) {
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
- streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StandardFilter(streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -181,7 +181,7 @@
public final TokenStream tokenStream( String fieldName, Reader reader ) {
TokenStream result = new StandardTokenizer( matchVersion, reader );
result = new StandardFilter( result );
- result = new LowerCaseFilter( result );
+ result = new LowerCaseFilter( matchVersion, result );
result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stoptable );
return result;
@@ -207,7 +207,7 @@
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stoptable);
setPreviousTokenStream(streams);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -200,7 +200,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
result = new GermanStemFilter(result, exclusionSet);
@@ -234,7 +234,7 @@
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new LowerCaseFilter(matchVersion, streams.result);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
streams.result = new GermanStemFilter(streams.result, exclusionSet);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -167,7 +167,7 @@
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new ArabicLetterTokenizer(reader);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
result = new PersianNormalizationFilter(result);
@@ -201,7 +201,7 @@
if (streams == null) {
streams = new SavedStreams();
streams.source = new ArabicLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new ArabicNormalizationFilter(streams.result);
/* additional persian-specific normalization */
streams.result = new PersianNormalizationFilter(streams.result);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -215,7 +215,7 @@
result, stoptable);
result = new FrenchStemFilter(result, excltable);
// Convert to lowercase after stemming!
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
return result;
}
@@ -244,7 +244,7 @@
streams.result, stoptable);
streams.result = new FrenchStemFilter(streams.result, excltable);
// Convert to lowercase after stemming!
- streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new LowerCaseFilter(matchVersion, streams.result);
setPreviousTokenStream(streams);
} else {
streams.source.reset(reader);
Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -118,7 +118,7 @@
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new RussianLetterTokenizer(reader);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
result = new RussianStemFilter(result);
@@ -146,7 +146,7 @@
if (streams == null) {
streams = new SavedStreams();
streams.source = new RussianLetterTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new LowerCaseFilter(matchVersion, streams.source);
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
streams.result = new RussianStemFilter(streams.result);
Modified: lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java (original)
+++ lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java Fri Nov 27 21:34:11 2009
@@ -158,7 +158,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}
@@ -228,7 +228,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestPosIncrementFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}
Modified: lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java (original)
+++ lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java Fri Nov 27 21:34:11 2009
@@ -152,7 +152,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}
@@ -222,7 +222,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestPosIncrementFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}
Modified: lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (original)
+++ lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -60,7 +60,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(matchVersion, reader);
result = new StandardFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
if (stopSet != null)
result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
result, stopSet);
@@ -91,7 +91,7 @@
streams = new SavedStreams();
streams.source = new StandardTokenizer(matchVersion, reader);
streams.result = new StandardFilter(streams.source);
- streams.result = new LowerCaseFilter(streams.result);
+ streams.result = new LowerCaseFilter(matchVersion, streams.result);
if (stopSet != null)
streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
streams.result, stopSet);
Modified: lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java (original)
+++ lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java Fri Nov 27 21:34:11 2009
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
File dataDir = new File(System.getProperty("dataDir", "./bin"));
@@ -96,7 +97,7 @@
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream ts = new WhitespaceTokenizer(reader);
- ts = new LowerCaseFilter(ts);
+ ts = new LowerCaseFilter(Version.LUCENE_CURRENT, ts);
ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
return ts;
}
@@ -113,7 +114,7 @@
if (streams == null) {
streams = new SavedStreams();
streams.source = new WhitespaceTokenizer(reader);
- streams.result = new LowerCaseFilter(streams.source);
+ streams.result = new LowerCaseFilter(Version.LUCENE_CURRENT, streams.source);
streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
setPreviousTokenStream(streams);
} else {
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java Fri Nov 27 21:34:11 2009
@@ -20,14 +20,38 @@
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.util.Version;
/**
* Normalizes token text to lower case.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating LowerCaseFilter:
+ * <ul>
+ * <li> As of 3.1, supplementary characters are properly lowercased.
+ * </ul>
*/
public final class LowerCaseFilter extends TokenFilter {
- public LowerCaseFilter(TokenStream in) {
+ private final CharacterUtils charUtils;
+
+ /**
+ * Create a new LowerCaseFilter, that normalizes token text to lower case.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param in TokenStream to filter
+ */
+ public LowerCaseFilter(Version matchVersion, TokenStream in) {
super(in);
termAtt = addAttribute(TermAttribute.class);
+ charUtils = CharacterUtils.getInstance(matchVersion);
+ }
+
+ /**
+ * @deprecated Use {@link #LowerCaseFilter(Version, TokenStream)} instead.
+ */
+ public LowerCaseFilter(TokenStream in) {
+ this(Version.LUCENE_30, in);
}
private TermAttribute termAtt;
@@ -35,12 +59,13 @@
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
-
final char[] buffer = termAtt.termBuffer();
final int length = termAtt.termLength();
- for(int i=0;i<length;i++)
- buffer[i] = Character.toLowerCase(buffer[i]);
-
+ for (int i = 0; i < length;) {
+ i += Character.toChars(
+ Character.toLowerCase(
+ charUtils.codePointAt(buffer, i)), buffer, i);
+ }
return true;
} else
return false;
Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -100,7 +100,7 @@
StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(enableStopPositionIncrements, result, stopSet);
return result;
}
@@ -146,7 +146,8 @@
setPreviousTokenStream(streams);
streams.tokenStream = new StandardTokenizer(matchVersion, reader);
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
- streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+ streams.filteredTokenStream = new LowerCaseFilter(matchVersion,
+ streams.filteredTokenStream);
streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
streams.filteredTokenStream, stopSet);
} else {
Added: lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java?rev=885024&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java Fri Nov 27 21:34:11 2009
@@ -0,0 +1,114 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * {@link CharacterUtils} provides a unified interface to Character-related
+ * operations to implement backwards compatible character operations based on a
+ * {@link Version} instance.
+ */
+public abstract class CharacterUtils {
+ private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
+ private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
+
+ /**
+ * Returns a {@link CharacterUtils} implementation according to the given
+ * {@link Version} instance.
+ *
+ * @param matchVersion
+ * a version instance
+ * @return a {@link CharacterUtils} implementation according to the given
+ * {@link Version} instance.
+ */
+ public static CharacterUtils getInstance(Version matchVersion) {
+ return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
+ }
+
+ /**
+ * Returns the code point at the given index of the char array.
+ * Depending on the {@link Version} passed to
+ * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
+ * of {@link Character#codePointAt(char[], int)} as it would have been
+ * available on a Java 1.4 JVM or on a later virtual machine version.
+ *
+ * @param chars
+ * a character array
+ * @param offset
+ * the offset to the char values in the chars array to be converted
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the array is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the length of
+ * the char array.
+ */
+ public abstract int codePointAt(char[] chars, int offset);
+
+ /**
+ * Returns the code point at the given index of the {@link CharSequence}.
+ * Depending on the {@link Version} passed to
+ * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
+ * of {@link Character#codePointAt(CharSequence, int)} as it would have been
+ * available on a Java 1.4 JVM or on a later virtual machine version.
+ *
+ * @param seq
+ * a character sequence
+ * @param offset
+ * the offset to the char values in the character sequence to be converted
+ *
+ * @return the Unicode code point at the given index
+ * @throws NullPointerException
+ * - if the sequence is null.
+ * @throws IndexOutOfBoundsException
+ * - if the value offset is negative or not less than the length of
+ * the character sequence.
+ */
+ public abstract int codePointAt(CharSequence seq, int offset);
+
+ private static final class Java5CharacterUtils extends CharacterUtils {
+ Java5CharacterUtils() {
+ };
+
+ @Override
+ public final int codePointAt(char[] chars, int offset) {
+ return Character.codePointAt(chars, offset);
+ }
+
+ @Override
+ public int codePointAt(CharSequence seq, int offset) {
+ return Character.codePointAt(seq, offset);
+ }
+ }
+
+ private static final class Java4CharacterUtils extends CharacterUtils {
+ Java4CharacterUtils() {
+ };
+
+ @Override
+ public final int codePointAt(char[] chars, int offset) {
+ return chars[offset];
+ }
+
+ @Override
+ public int codePointAt(CharSequence seq, int offset) {
+ return seq.charAt(offset);
+ }
+ }
+
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java Fri Nov 27 21:34:11 2009
@@ -139,6 +139,99 @@
assertTrue(ts.incrementToken());
assertFalse(ts.incrementToken());
}
+
+ private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
+
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new LowerCaseFilter(Version.LUCENE_CURRENT,
+ new WhitespaceTokenizer(reader));
+ }
+
+ }
+
+ /**
+ * @deprecated remove this when lucene 3.0 "broken unicode 4" support
+ * is no longer needed.
+ */
+ private static class LowerCaseWhitespaceAnalyzerBWComp extends Analyzer {
+
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return new LowerCaseFilter(new WhitespaceTokenizer(reader));
+ }
+
+ }
+
+ /**
+ * Test that LowerCaseFilter handles the entire Unicode range correctly
+ */
+ public void testLowerCaseFilter() throws IOException {
+ Analyzer a = new LowerCaseWhitespaceAnalyzer();
+ // BMP
+ assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
+ // supplementary
+ assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
+ new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
+ assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
+ new String[] { "abaca\ud801\udc3edaba" });
+ // unpaired lead surrogate
+ assertAnalyzesTo(a, "AbaC\uD801AdaBa",
+ new String [] { "abac\uD801adaba" });
+ // unpaired trail surrogate
+ assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
+ new String [] { "abac\uDC16adaba" });
+ }
+
+ /**
+ * Test that LowerCaseFilter lowercases correctly when the term buffer still
+ * holds a trailing surrogate left over from a previous, longer term and the
+ * current term ends with the corresponding leading surrogate.
+ */
+ public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
+ // test if the limit of the termbuffer is correctly used with supplementary
+ // chars
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+ "BogustermBogusterm\udc16"));
+ LowerCaseFilter filter = new LowerCaseFilter(Version.LUCENE_CURRENT,
+ tokenizer);
+ assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
+ filter.reset();
+ String highSurEndingUpper = "BogustermBoguster\ud801";
+ String highSurEndingLower = "bogustermboguster\ud801";
+ tokenizer.reset(new StringReader(highSurEndingUpper));
+ assertTokenStreamContents(filter, new String[] {highSurEndingLower});
+ assertTrue(filter.hasAttribute(TermAttribute.class));
+ char[] termBuffer = filter.getAttribute(TermAttribute.class).termBuffer();
+ int length = highSurEndingLower.length();
+ assertEquals('\ud801', termBuffer[length - 1]);
+ assertEquals('\udc3e', termBuffer[length]);
+
+ }
+
+ /**
+ * Test that LowerCaseFilter only lowercases BMP characters for back compat,
+ * depending upon version
+ * @deprecated remove this test when lucene 3.0 "broken unicode 4" support
+ * is no longer needed.
+ */
+ public void testLowerCaseFilterBWComp() throws IOException {
+ Analyzer a = new LowerCaseWhitespaceAnalyzerBWComp();
+ // BMP
+ assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
+ // supplementary, no-op
+ assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
+ new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
+ assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
+ new String[] { "abaca\ud801\udc16daba" });
+ // unpaired lead surrogate
+ assertAnalyzesTo(a, "AbaC\uD801AdaBa",
+ new String [] { "abac\uD801adaba" });
+ // unpaired trail surrogate
+ assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
+ new String [] { "abac\uDC16adaba" });
+ }
+
}
class PayloadSetter extends TokenFilter {
Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java Fri Nov 27 21:34:11 2009
@@ -146,7 +146,7 @@
assertEquals("there must be 2 times 'Dog' in the stream", 2, i);
source1.reset();
- TokenStream lowerCasing = new LowerCaseFilter(source1);
+ TokenStream lowerCasing = new LowerCaseFilter(Version.LUCENE_CURRENT, source1);
i = 0;
termAtt = lowerCasing.getAttribute(TermAttribute.class);
while (lowerCasing.incrementToken()) {
Modified: lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -138,7 +138,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}
@@ -206,7 +206,7 @@
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
result = new TestPosIncrementFilter(result);
- result = new LowerCaseFilter(result);
+ result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
return result;
}
}