You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ry...@apache.org on 2014/01/08 20:54:13 UTC

svn commit: r1556618 - in /lucene/dev/trunk: ./ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/ lucene/analysis/common/sr...

Author: ryan
Date: Wed Jan  8 19:54:13 2014
New Revision: 1556618

URL: http://svn.apache.org/r1556618
Log:
LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens (merge from 4x)

Added:
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
      - copied unchanged from r1556617, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
      - copied unchanged from r1556617, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
Modified:
    lucene/dev/trunk/   (props changed)
    lucene/dev/trunk/lucene/   (props changed)
    lucene/dev/trunk/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/trunk/lucene/analysis/   (props changed)
    lucene/dev/trunk/lucene/analysis/common/   (props changed)
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jan  8 19:54:13 2014
@@ -83,6 +83,9 @@ New Features
 
 * LUCENE-5379: Add Analyzer for Kurdish.  (Robert Muir)
 
+* LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens. (ryan)
+
+
 Build
 
 * LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java Wed Jan  8 19:54:13 2014
@@ -132,6 +132,23 @@ public abstract class CharacterUtils {
      }
   }
 
+  /**
+   * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)},
+   * in place, starting at the given offset.
+   * @param buffer the char buffer to UPPERCASE
+   * @param offset the offset to start at; must satisfy 0 &lt;= offset &lt;= buffer.length
+   * @param limit the index (exclusive) at which upper-casing stops; at most buffer.length
+   */
+  public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+    assert buffer.length >= limit;
+    assert offset >= 0 && offset <= buffer.length;
+    for (int i = offset; i < limit;) {
+      i += Character.toChars(
+              Character.toUpperCase(
+                  codePointAt(buffer, i, limit)), buffer, i);
+    }
+  }
+
   /** Converts a sequence of Java characters to a sequence of unicode code points.
    *  @return the number of code points written to the destination buffer */
   public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java Wed Jan  8 19:54:13 2014
@@ -128,6 +128,17 @@ public class TestAnalyzers extends BaseT
     
   }
   
+  /** Test analyzer that feeds a WhitespaceTokenizer into an UpperCaseFilter. */
+  private static class UpperCaseWhitespaceAnalyzer extends Analyzer {
+    @Override
+    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      final Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      final UpperCaseFilter upperCased = new UpperCaseFilter(TEST_VERSION_CURRENT, source);
+      return new TokenStreamComponents(source, upperCased);
+    }
+  }
+  
+  
   /**
    * Test that LowercaseFilter handles entire unicode range correctly
    */
@@ -147,6 +158,27 @@ public class TestAnalyzers extends BaseT
     assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
         new String [] { "abac\uDC16adaba" });
   }
+
+  /**
+   * Test that UpperCaseFilter handles entire unicode range correctly
+   */
+  public void testUpperCaseFilter() throws IOException {
+    Analyzer a = new UpperCaseWhitespaceAnalyzer();
+    // BMP
+    assertAnalyzesTo(a, "AbaCaDabA", new String[] { "ABACADABA" });
+    // supplementary (surrogate pairs must be upper-cased as whole code points)
+    assertAnalyzesTo(a, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e",
+          new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
+    assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA", 
+         new String[] { "ABACA\ud801\udc16DABA" });
+    // unpaired lead surrogate (expected to pass through unchanged)
+    assertAnalyzesTo(a, "AbaC\uD801AdaBa", 
+        new String [] { "ABAC\uD801ADABA" });
+    // unpaired trail surrogate (expected to pass through unchanged)
+    assertAnalyzesTo(a, "AbaC\uDC16AdaBa", 
+        new String [] { "ABAC\uDC16ADABA" });
+  }
+  
   
   /**
    * Test that LowercaseFilter handles the lowercasing correctly if the term