You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ry...@apache.org on 2014/01/08 20:54:13 UTC
svn commit: r1556618 - in /lucene/dev/trunk: ./ lucene/ lucene/analysis/
lucene/analysis/common/
lucene/analysis/common/src/java/org/apache/lucene/analysis/core/
lucene/analysis/common/src/java/org/apache/lucene/analysis/util/
lucene/analysis/common/sr...
Author: ryan
Date: Wed Jan 8 19:54:13 2014
New Revision: 1556618
URL: http://svn.apache.org/r1556618
Log:
LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens (merge from 4x)
Added:
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
- copied unchanged from r1556617, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
- copied unchanged from r1556617, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilterFactory.java
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/lucene/CHANGES.txt (contents, props changed)
lucene/dev/trunk/lucene/analysis/ (props changed)
lucene/dev/trunk/lucene/analysis/common/ (props changed)
lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jan 8 19:54:13 2014
@@ -83,6 +83,9 @@ New Features
* LUCENE-5379: Add Analyzer for Kurdish. (Robert Muir)
+* LUCENE-5369: Added an UpperCaseFilter to make UPPERCASE tokens. (ryan)
+
+
Build
* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable
Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java Wed Jan 8 19:54:13 2014
@@ -132,6 +132,23 @@ public abstract class CharacterUtils {
}
}
+ /**
+ * Converts each unicode codepoint to UpperCase via {@link Character#toUpperCase(int)} starting
+ * at the given offset.
+ * @param buffer the char buffer to UPPERCASE
+ * @param offset the offset to start at; must be non-negative and no greater than the buffer length
+ * @param limit the index in the buffer at which upper-casing stops (exclusive)
+ */
+ public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+ assert buffer.length >= limit;
+ assert offset >= 0 && offset <= buffer.length; // lower bound was "offset <= 0", rejecting any positive offset
+ for (int i = offset; i < limit;) {
+ // toChars stores the upper-cased code point at index i and returns the chars written (1 or 2)
+ i += Character.toChars(
+ Character.toUpperCase(codePointAt(buffer, i, limit)), buffer, i);
+ }
+ }
+
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java?rev=1556618&r1=1556617&r2=1556618&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java Wed Jan 8 19:54:13 2014
@@ -128,6 +128,17 @@ public class TestAnalyzers extends BaseT
}
+ private static class UpperCaseWhitespaceAnalyzer extends Analyzer {
+ // Helper analyzer for the tests below: chains a WhitespaceTokenizer into an UpperCaseFilter.
+ @Override
+ public TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ return new TokenStreamComponents(tokenizer, new UpperCaseFilter(TEST_VERSION_CURRENT, tokenizer));
+ }
+
+ }
+
+
/**
* Test that LowercaseFilter handles entire unicode range correctly
*/
@@ -147,6 +158,27 @@ public class TestAnalyzers extends BaseT
assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
new String [] { "abac\uDC16adaba" });
}
+
+ /**
+ * Test that UpperCaseFilter handles entire unicode range correctly
+ */
+ public void testUpperCaseFilter() throws IOException {
+ Analyzer a = new UpperCaseWhitespaceAnalyzer();
+ // BMP: mixed-case ASCII letters come back all upper case
+ assertAnalyzesTo(a, "AbaCaDabA", new String[] { "ABACADABA" });
+ // supplementary: surrogate pair D801 DC3E is expected to map to D801 DC16
+ assertAnalyzesTo(a, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e",
+ new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
+ assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA",
+ new String[] { "ABACA\ud801\udc16DABA" });
+ // unpaired lead surrogate: expected to pass through unchanged
+ assertAnalyzesTo(a, "AbaC\uD801AdaBa",
+ new String [] { "ABAC\uD801ADABA" });
+ // unpaired trail surrogate: expected to pass through unchanged
+ assertAnalyzesTo(a, "AbaC\uDC16AdaBa",
+ new String [] { "ABAC\uDC16ADABA" });
+ }
+
/**
* Test that LowercaseFilter handles the lowercasing correctly if the term