Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/27 22:34:15 UTC

svn commit: r885024 - in /lucene/java/trunk: ./ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/ contrib/a...

Author: rmuir
Date: Fri Nov 27 21:34:11 2009
New Revision: 885024

URL: http://svn.apache.org/viewvc?rev=885024&view=rev
Log:
LUCENE-2069: supplementary char support for lowercasefilter

Added:
    lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java   (with props)
Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
    lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java
    lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java
    lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
    lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
    lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 27 21:34:11 2009
@@ -25,6 +25,12 @@
 
 New features
 
+* LUCENE-2069: Added Unicode 4 support to LowerCaseFilter. Due to the switch
+  to Java 5, supplementary characters are now lowercased correctly.
+  LowerCaseFilter now requires a Version argument to preserve 
+  backwards compatibility. If Version < 3.1 is passed to the constructor, 
+  LowerCaseFilter yields the old behavior. (Simon Willnauer, Robert Muir)  
+
 Optimizations
 
 * LUCENE-2086: When resolving deleted terms, do so in term sort order
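
For readers of the entry above: the following is a minimal usage sketch (not part of this commit) showing the new Version-aware constructor next to the now-deprecated single-argument form. The class name and the WhitespaceTokenizer wiring are only an illustration.

    import java.io.Reader;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.util.Version;

    public class LowerCaseFilterUsage {
      // Version >= 3.1: supplementary characters are lowercased correctly.
      public static TokenStream newForm(Reader reader) {
        return new LowerCaseFilter(Version.LUCENE_31, new WhitespaceTokenizer(reader));
      }

      // Deprecated single-argument form: behaves like Version.LUCENE_30 and
      // leaves supplementary characters untouched.
      public static TokenStream oldForm(Reader reader) {
        return new LowerCaseFilter(new WhitespaceTokenizer(reader));
      }
    }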

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -168,7 +168,7 @@
   @Override
   public final TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer( reader );
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                              result, stoptable );
@@ -198,7 +198,7 @@
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new LowerCaseFilter(matchVersion, streams.source);
       // the order here is important: the stopword list is not normalized!
       streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                       streams.result, stoptable);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -199,7 +199,7 @@
 	@Override
 	public final TokenStream tokenStream(String fieldName, Reader reader) {
                 TokenStream result = new StandardTokenizer( matchVersion, reader );
-		result = new LowerCaseFilter( result );
+		result = new LowerCaseFilter( matchVersion, result );
 		result = new StandardFilter( result );
 		result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                          result, stoptable );
@@ -227,7 +227,7 @@
       if (streams == null) {
         streams = new SavedStreams();
         streams.source = new StandardTokenizer(matchVersion, reader);
-        streams.result = new LowerCaseFilter(streams.source);
+        streams.result = new LowerCaseFilter(matchVersion, streams.source);
         streams.result = new StandardFilter(streams.result);
         streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                         streams.result, stoptable);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -181,7 +181,7 @@
 	public final TokenStream tokenStream( String fieldName, Reader reader ) {
                 TokenStream result = new StandardTokenizer( matchVersion, reader );
 		result = new StandardFilter( result );
-		result = new LowerCaseFilter( result );
+		result = new LowerCaseFilter( matchVersion, result );
 		result = new StopFilter( StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                          result, stoptable );
 		return result;
@@ -207,7 +207,7 @@
         streams = new SavedStreams();
         streams.source = new StandardTokenizer(matchVersion, reader);
         streams.result = new StandardFilter(streams.source);
-        streams.result = new LowerCaseFilter(streams.result);
+        streams.result = new LowerCaseFilter(matchVersion, streams.result);
         streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                         streams.result, stoptable);
         setPreviousTokenStream(streams);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -200,7 +200,7 @@
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                             result, stopSet);
     result = new GermanStemFilter(result, exclusionSet);
@@ -234,7 +234,7 @@
       streams = new SavedStreams();
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(matchVersion, streams.result);
       streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                       streams.result, stopSet);
       streams.result = new GermanStemFilter(streams.result, exclusionSet);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -167,7 +167,7 @@
   @Override
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new ArabicLetterTokenizer(reader);
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */
     result = new PersianNormalizationFilter(result);
@@ -201,7 +201,7 @@
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new ArabicLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new LowerCaseFilter(matchVersion, streams.source);
       streams.result = new ArabicNormalizationFilter(streams.result);
       /* additional persian-specific normalization */
       streams.result = new PersianNormalizationFilter(streams.result);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -215,7 +215,7 @@
                             result, stoptable);
     result = new FrenchStemFilter(result, excltable);
     // Convert to lowercase after stemming!
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     return result;
   }
   
@@ -244,7 +244,7 @@
                                       streams.result, stoptable);
       streams.result = new FrenchStemFilter(streams.result, excltable);
       // Convert to lowercase after stemming!
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(matchVersion, streams.result);
       setPreviousTokenStream(streams);
     } else {
       streams.source.reset(reader);

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -118,7 +118,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader)
     {
         TokenStream result = new RussianLetterTokenizer(reader);
-        result = new LowerCaseFilter(result);
+        result = new LowerCaseFilter(matchVersion, result);
         result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                 result, stopSet);
         result = new RussianStemFilter(result);
@@ -146,7 +146,7 @@
     if (streams == null) {
       streams = new SavedStreams();
       streams.source = new RussianLetterTokenizer(reader);
-      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new LowerCaseFilter(matchVersion, streams.source);
       streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                       streams.result, stopSet);
       streams.result = new RussianStemFilter(streams.result);

Modified: lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java (original)
+++ lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerQPHelper.java Fri Nov 27 21:34:11 2009
@@ -158,7 +158,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }
@@ -228,7 +228,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestPosIncrementFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }

Modified: lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java (original)
+++ lucene/java/trunk/contrib/queryparser/src/test/org/apache/lucene/queryParser/standard/TestMultiAnalyzerWrapper.java Fri Nov 27 21:34:11 2009
@@ -152,7 +152,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }
@@ -222,7 +222,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestPosIncrementFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }

Modified: lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (original)
+++ lucene/java/trunk/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -60,7 +60,7 @@
   public TokenStream tokenStream(String fieldName, Reader reader) {
     TokenStream result = new StandardTokenizer(matchVersion, reader);
     result = new StandardFilter(result);
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     if (stopSet != null)
       result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                               result, stopSet);
@@ -91,7 +91,7 @@
       streams = new SavedStreams();
       streams.source = new StandardTokenizer(matchVersion, reader);
       streams.result = new StandardFilter(streams.source);
-      streams.result = new LowerCaseFilter(streams.result);
+      streams.result = new LowerCaseFilter(matchVersion, streams.result);
       if (stopSet != null)
         streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                         streams.result, stopSet);

Modified: lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java (original)
+++ lucene/java/trunk/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java Fri Nov 27 21:34:11 2009
@@ -29,6 +29,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
 
 public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
   File dataDir = new File(System.getProperty("dataDir", "./bin"));
@@ -96,7 +97,7 @@
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream ts = new WhitespaceTokenizer(reader);
-      ts = new LowerCaseFilter(ts);
+      ts = new LowerCaseFilter(Version.LUCENE_CURRENT, ts);
       ts = new SynonymTokenFilter(ts, synonyms, maxSynonyms);
       return ts;
     }
@@ -113,7 +114,7 @@
       if (streams == null) {
         streams = new SavedStreams();
         streams.source = new WhitespaceTokenizer(reader);
-        streams.result = new LowerCaseFilter(streams.source);
+        streams.result = new LowerCaseFilter(Version.LUCENE_CURRENT, streams.source);
         streams.result = new SynonymTokenFilter(streams.result, synonyms, maxSynonyms);
         setPreviousTokenStream(streams);
       } else {

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/LowerCaseFilter.java Fri Nov 27 21:34:11 2009
@@ -20,14 +20,38 @@
 import java.io.IOException;
 
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.CharacterUtils;
+import org.apache.lucene.util.Version;
 
 /**
  * Normalizes token text to lower case.
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating LowerCaseFilter:
+ * <ul>
+ *   <li> As of 3.1, supplementary characters are properly lowercased.
+ * </ul>
  */
 public final class LowerCaseFilter extends TokenFilter {
-  public LowerCaseFilter(TokenStream in) {
+  private final CharacterUtils charUtils;
+
+  /**
+   * Create a new LowerCaseFilter, that normalizes token text to lower case.
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param in TokenStream to filter
+   */
+  public LowerCaseFilter(Version matchVersion, TokenStream in) {
     super(in);
     termAtt = addAttribute(TermAttribute.class);
+    charUtils = CharacterUtils.getInstance(matchVersion);
+  }
+  
+  /**
+   * @deprecated Use {@link #LowerCaseFilter(Version, TokenStream)} instead.
+   */
+  public LowerCaseFilter(TokenStream in) {
+    this(Version.LUCENE_30, in);
   }
 
   private TermAttribute termAtt;
@@ -35,12 +59,13 @@
   @Override
   public final boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-
       final char[] buffer = termAtt.termBuffer();
       final int length = termAtt.termLength();
-      for(int i=0;i<length;i++)
-        buffer[i] = Character.toLowerCase(buffer[i]);
-
+      for (int i = 0; i < length;) {
+       i += Character.toChars(
+               Character.toLowerCase(
+                   charUtils.codePointAt(buffer, i)), buffer, i);
+      }
       return true;
     } else
       return false;
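
A standalone sketch (not part of the commit) of the per-code-point loop introduced above, written against java.lang.Character directly; the committed filter obtains the code point through CharacterUtils instead, so that streams built with Version < 3.1 keep the old per-char behavior.

    // Lowercase the first `length` chars of `buffer` in place, one code point
    // at a time. Character.codePointAt joins a surrogate pair into a single
    // supplementary code point; Character.toChars writes the lowercased result
    // back into the buffer and returns 1 (BMP) or 2 (supplementary), which is
    // exactly how far the index must advance.
    static void lowerCaseInPlace(char[] buffer, int length) {
      for (int i = 0; i < length;) {
        i += Character.toChars(
            Character.toLowerCase(Character.codePointAt(buffer, i)),
            buffer, i);
      }
    }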

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -100,7 +100,7 @@
     StandardTokenizer tokenStream = new StandardTokenizer(matchVersion, reader);
     tokenStream.setMaxTokenLength(maxTokenLength);
     TokenStream result = new StandardFilter(tokenStream);
-    result = new LowerCaseFilter(result);
+    result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(enableStopPositionIncrements, result, stopSet);
     return result;
   }
@@ -146,7 +146,8 @@
       setPreviousTokenStream(streams);
       streams.tokenStream = new StandardTokenizer(matchVersion, reader);
       streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
-      streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+      streams.filteredTokenStream = new LowerCaseFilter(matchVersion,
+          streams.filteredTokenStream);
       streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements,
                                                    streams.filteredTokenStream, stopSet);
     } else {

Added: lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java?rev=885024&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java Fri Nov 27 21:34:11 2009
@@ -0,0 +1,114 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * {@link CharacterUtils} provides a unified interface to Character-related
+ * operations to implement backwards compatible character operations based on a
+ * {@link Version} instance.
+ */
+public abstract class CharacterUtils {
+  private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
+  private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
+
+  /**
+   * Returns a {@link CharacterUtils} implementation according to the given
+   * {@link Version} instance.
+   * 
+   * @param matchVersion
+   *          a version instance
+   * @return a {@link CharacterUtils} implementation according to the given
+   *         {@link Version} instance.
+   */
+  public static CharacterUtils getInstance(Version matchVersion) {
+    return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
+  }
+
+  /**
+   * Returns the code point at the given index of the char array.
+   * Depending on the {@link Version} passed to
+   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
+   * of {@link Character#codePointAt(char[], int)} as it would have been
+   * available on a Java 1.4 JVM or on a later virtual machine version.
+   * 
+   * @param chars
+   *          a character array
+   * @param offset
+   *          the offset to the char values in the chars array to be converted
+   * 
+   * @return the Unicode code point at the given index
+   * @throws NullPointerException
+   *           - if the array is null.
+   * @throws IndexOutOfBoundsException
+   *           - if the value offset is negative or not less than the length of
+   *           the char array.
+   */
+  public abstract int codePointAt(char[] chars, int offset);
+
+  /**
+   * Returns the code point at the given index of the {@link CharSequence}.
+   * Depending on the {@link Version} passed to
+   * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
+   * of {@link Character#codePointAt(char[], int)} as it would have been
+   * available on a Java 1.4 JVM or on a later virtual machine version.
+   * 
+   * @param seq
+   *          a character sequence
+   * @param offset
+   *          the offset to the char values in the chars array to be converted
+   * 
+   * @return the Unicode code point at the given index
+   * @throws NullPointerException
+   *           - if the sequence is null.
+   * @throws IndexOutOfBoundsException
+   *           - if the value offset is negative or not less than the length of
+   *           the character sequence.
+   */
+  public abstract int codePointAt(CharSequence seq, int offset);
+
+  private static final class Java5CharacterUtils extends CharacterUtils {
+    Java5CharacterUtils() {
+    };
+
+    @Override
+    public final int codePointAt(char[] chars, int offset) {
+      return Character.codePointAt(chars, offset);
+    }
+
+    @Override
+    public int codePointAt(CharSequence seq, int offset) {
+      return Character.codePointAt(seq, offset);
+    }
+  }
+
+  private static final class Java4CharacterUtils extends CharacterUtils {
+    Java4CharacterUtils() {
+    };
+
+    @Override
+    public final int codePointAt(char[] chars, int offset) {
+      return chars[offset];
+    }
+
+    @Override
+    public int codePointAt(CharSequence seq, int offset) {
+      return seq.charAt(offset);
+    }
+  }
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/util/CharacterUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native
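
A minimal sketch (not part of the commit) of the observable difference between the two implementations selected by getInstance above, using the supplementary character U+10416, a Deseret capital letter encoded as the surrogate pair \ud801\udc16 and also used by the new tests below.

    import org.apache.lucene.util.CharacterUtils;
    import org.apache.lucene.util.Version;

    public class CharacterUtilsDemo {
      public static void main(String[] args) {
        char[] deseret = { '\ud801', '\udc16' }; // surrogate pair for U+10416

        // matchVersion >= 3.1: surrogate pairs are combined (Java 5 semantics).
        CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
        // matchVersion < 3.1: per-char handling, as before the switch to Java 5.
        CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);

        System.out.println(Integer.toHexString(java5.codePointAt(deseret, 0))); // 10416
        System.out.println(Integer.toHexString(java4.codePointAt(deseret, 0))); // d801
      }
    }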

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestAnalyzers.java Fri Nov 27 21:34:11 2009
@@ -139,6 +139,99 @@
     assertTrue(ts.incrementToken());
     assertFalse(ts.incrementToken());
   }
+  
+  private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new LowerCaseFilter(Version.LUCENE_CURRENT,
+          new WhitespaceTokenizer(reader));
+    }
+    
+  }
+  
+  /**
+   * @deprecated remove this when lucene 3.0 "broken unicode 4" support
+   * is no longer needed.
+   */
+  private static class LowerCaseWhitespaceAnalyzerBWComp extends Analyzer {
+
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new LowerCaseFilter(new WhitespaceTokenizer(reader));
+    }
+    
+  }
+  
+  /**
+   * Test that LowercaseFilter handles entire unicode range correctly
+   */
+  public void testLowerCaseFilter() throws IOException {
+    Analyzer a = new LowerCaseWhitespaceAnalyzer();
+    // BMP
+    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
+    // supplementary
+    assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
+        new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
+    assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA", 
+        new String[] { "abaca\ud801\udc3edaba" });
+    // unpaired lead surrogate
+    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
+        new String [] { "abac\uD801adaba" });
+    // unpaired trail surrogate
+    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
+        new String [] { "abac\uDC16adaba" });
+  }
+  
+  /**
+   * Test that LowercaseFilter handles the lowercasing correctly if the term
+   * buffer has a trailing surrogate character leftover and the current term in
+   * the buffer ends with a corresponding leading surrogate.
+   */
+  public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
+    // test if the limit of the termbuffer is correctly used with supplementary
+    // chars
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
+        "BogustermBogusterm\udc16"));
+    LowerCaseFilter filter = new LowerCaseFilter(Version.LUCENE_CURRENT,
+        tokenizer);
+    assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
+    filter.reset();
+    String highSurEndingUpper = "BogustermBoguster\ud801";
+    String highSurEndingLower = "bogustermboguster\ud801";
+    tokenizer.reset(new StringReader(highSurEndingUpper));
+    assertTokenStreamContents(filter, new String[] {highSurEndingLower});
+    assertTrue(filter.hasAttribute(TermAttribute.class));
+    char[] termBuffer = filter.getAttribute(TermAttribute.class).termBuffer();
+    int length = highSurEndingLower.length();
+    assertEquals('\ud801', termBuffer[length - 1]);
+    assertEquals('\udc3e', termBuffer[length]);
+    
+  }
+  
+  /**
+   * Test that LowercaseFilter only works on BMP for back compat,
+   * depending upon version
+   * @deprecated remove this test when lucene 3.0 "broken unicode 4" support
+   * is no longer needed.
+   */
+  public void testLowerCaseFilterBWComp() throws IOException {
+    Analyzer a = new LowerCaseWhitespaceAnalyzerBWComp();
+    // BMP
+    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "abacadaba" });
+    // supplementary, no-op
+    assertAnalyzesTo(a, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
+        new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
+    assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA",
+        new String[] { "abaca\ud801\udc16daba" });
+    // unpaired lead surrogate
+    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
+        new String [] { "abac\uD801adaba" });
+    // unpaired trail surrogate
+    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
+        new String [] { "abac\uDC16adaba" });
+  }
+  
 }
 
 class PayloadSetter extends TokenFilter {

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestTeeSinkTokenFilter.java Fri Nov 27 21:34:11 2009
@@ -146,7 +146,7 @@
     assertEquals("there must be 2 times 'Dog' in the stream", 2, i);
     
     source1.reset();
-    TokenStream lowerCasing = new LowerCaseFilter(source1);
+    TokenStream lowerCasing = new LowerCaseFilter(Version.LUCENE_CURRENT, source1);
     i = 0;
     termAtt = lowerCasing.getAttribute(TermAttribute.class);
     while (lowerCasing.incrementToken()) {

Modified: lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java?rev=885024&r1=885023&r2=885024&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestMultiAnalyzer.java Fri Nov 27 21:34:11 2009
@@ -138,7 +138,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }
@@ -206,7 +206,7 @@
     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
       result = new TestPosIncrementFilter(result);
-      result = new LowerCaseFilter(result);
+      result = new LowerCaseFilter(Version.LUCENE_CURRENT, result);
       return result;
     }
   }