You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/06/13 13:34:48 UTC
[2/2] lucene-solr:branch_6x: LUCENE-7329: Simplify CharacterUtils.
LUCENE-7329: Simplify CharacterUtils.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/061f6880
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/061f6880
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/061f6880
Branch: refs/heads/branch_6x
Commit: 061f688022debf8db001886bc4e4847cc03c572d
Parents: f1ddc55
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon Jun 13 15:23:08 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon Jun 13 15:33:57 2016 +0200
----------------------------------------------------------------------
.../lucene/analysis/core/LowerCaseFilter.java | 3 +-
.../lucene/analysis/core/UpperCaseFilter.java | 3 +-
.../analysis/el/GreekLowerCaseFilter.java | 4 +-
.../analysis/ngram/EdgeNGramTokenFilter.java | 7 +-
.../lucene/analysis/ngram/NGramTokenFilter.java | 13 +-
.../lucene/analysis/ngram/NGramTokenizer.java | 8 +-
.../lucene/analysis/util/CharArrayMap.java | 17 +-
.../lucene/analysis/util/CharTokenizer.java | 5 +-
.../lucene/analysis/util/CharacterUtils.java | 229 ++++---------------
.../TestStemmerOverrideFilter.java | 4 +-
.../analysis/util/TestCharacterUtils.java | 155 ++-----------
.../analysis/morfologik/MorfologikFilter.java | 4 +-
12 files changed, 86 insertions(+), 366 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
index d1198a6..ade6a58 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
@@ -28,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
* Normalizes token text to lower case.
*/
public final class LowerCaseFilter extends TokenFilter {
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
@@ -43,7 +42,7 @@ public final class LowerCaseFilter extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+ CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
index 9c2c283..6d3f6bb 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
@@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
* general search matching
*/
public final class UpperCaseFilter extends TokenFilter {
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
@@ -48,7 +47,7 @@ public final class UpperCaseFilter extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
+ CharacterUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
index e4aecf3..3185b2d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
@@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Normalizes token text to lower case, removes some Greek diacritics,
@@ -29,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
*/
public final class GreekLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
/**
* Create a GreekLowerCaseFilter that normalizes Greek token text.
@@ -47,7 +45,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
- lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
+ lowerCase(Character.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
index 2c10778..827e26f 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@@ -25,7 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Tokenizes the given token into n-grams of given size(s).
@@ -38,7 +37,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
- private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
@@ -73,7 +71,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
- this.charUtils = CharacterUtils.getInstance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@@ -87,7 +84,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
- curCodePointCount = charUtils.codePointCount(termAtt);
+ curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@@ -108,7 +105,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
- final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
+ final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
index 5a84bff..e275cfa 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Tokenizes the input into n-grams of the given size(s).
@@ -56,9 +55,7 @@ public final class NGramTokenFilter extends TokenFilter {
private int curPosInc, curPosLen;
private int tokStart;
private int tokEnd;
- private boolean hasIllegalOffsets; // only if the length changed before this filter
- private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@@ -72,7 +69,6 @@ public final class NGramTokenFilter extends TokenFilter {
*/
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
- this.charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -104,16 +100,13 @@ public final class NGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
- curCodePointCount = charUtils.codePointCount(termAtt);
+ curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
curPosLen = posLenAtt.getPositionLength();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
@@ -123,8 +116,8 @@ public final class NGramTokenFilter extends TokenFilter {
}
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
- final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
- final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+ final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+ final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index 1c8aa7c..da104c9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -57,7 +57,6 @@ public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private CharacterUtils charUtils;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
@@ -110,7 +109,6 @@ public class NGramTokenizer extends Tokenizer {
}
private void init(int minGram, int maxGram, boolean edgesOnly) {
- charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -142,9 +140,9 @@ public class NGramTokenizer extends Tokenizer {
bufferStart = 0;
// fill in remaining space
- exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
+ exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
- bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
+ bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
@@ -168,7 +166,7 @@ public class NGramTokenizer extends Tokenizer {
continue;
}
- final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+ final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
index 289ee08..e414366 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
@@ -40,7 +40,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
private final static int INIT_SIZE = 8;
- private final CharacterUtils charUtils;
private boolean ignoreCase;
private int count;
char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
@@ -63,7 +62,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
size <<= 1;
keys = new char[size][];
values = (V[]) new Object[size];
- this.charUtils = CharacterUtils.getInstance();
}
/**
@@ -86,7 +84,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
this.values = toCopy.values;
this.ignoreCase = toCopy.ignoreCase;
this.count = toCopy.count;
- this.charUtils = toCopy.charUtils;
}
/** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
@@ -192,7 +189,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
*/
public V put(char[] text, V value) {
if (ignoreCase) {
- charUtils.toLowerCase(text, 0, text.length);
+ CharacterUtils.toLowerCase(text, 0, text.length);
}
int slot = getSlot(text, 0, text.length);
if (keys[slot] != null) {
@@ -237,8 +234,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
final int limit = off+len;
if (ignoreCase) {
for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+ final int codePointAt = Character.codePointAt(text1, off+i, limit);
+ if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@@ -257,8 +254,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
return false;
if (ignoreCase) {
for(int i=0;i<len;) {
- final int codePointAt = charUtils.codePointAt(text1, i);
- if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+ final int codePointAt = Character.codePointAt(text1, i);
+ if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@@ -278,7 +275,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
final int stop = offset + len;
if (ignoreCase) {
for (int i=offset; i<stop;) {
- final int codePointAt = charUtils.codePointAt(text, i, stop);
+ final int codePointAt = Character.codePointAt(text, i, stop);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
@@ -297,7 +294,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
int len = text.length();
if (ignoreCase) {
for (int i=0; i<len;) {
- int codePointAt = charUtils.codePointAt(text, i);
+ int codePointAt = Character.codePointAt(text, i);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
index 7683239..4952f99 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@@ -199,7 +199,6 @@ public abstract class CharTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
/**
@@ -229,7 +228,7 @@ public abstract class CharTokenizer extends Tokenizer {
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
- charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
+ CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
@@ -243,7 +242,7 @@ public abstract class CharTokenizer extends Tokenizer {
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
- final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
+ final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
index f14b1f7..b728523 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
@@ -20,76 +20,13 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.Reader;
-import org.apache.lucene.util.Version;
-
/**
- * {@link CharacterUtils} provides a unified interface to Character-related
- * operations to implement backwards compatible character operations based on a
- * {@link Version} instance.
- *
+ * Utility class to write tokenizers or token filters.
* @lucene.internal
*/
-public abstract class CharacterUtils {
- private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
- private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
-
- /**
- * Returns a {@link CharacterUtils} implementation.
- * @return a {@link CharacterUtils} implementation according to the given
- * {@link Version} instance.
- */
- public static CharacterUtils getInstance() {
- return JAVA_5;
- }
-
- /**
- * explicitly returns a version matching java 4 semantics
- * @deprecated Only for n-gram backwards compat
- */
- @Deprecated
- public static CharacterUtils getJava4Instance() {
- return JAVA_4;
- }
-
- /**
- * Returns the code point at the given index of the {@link CharSequence}.
- *
- * @param seq
- * a character sequence
- * @param offset
- * the offset to the char values in the chars array to be converted
- *
- * @return the Unicode code point at the given index
- * @throws NullPointerException
- * - if the sequence is null.
- * @throws IndexOutOfBoundsException
- * - if the value offset is negative or not less than the length of
- * the character sequence.
- */
- public abstract int codePointAt(final CharSequence seq, final int offset);
-
- /**
- * Returns the code point at the given index of the char array where only elements
- * with index less than the limit are used.
- *
- * @param chars
- * a character array
- * @param offset
- * the offset to the char values in the chars array to be converted
- * @param limit the index after the last element that should be used to calculate
- * codepoint.
- *
- * @return the Unicode code point at the given index
- * @throws NullPointerException
- * - if the array is null.
- * @throws IndexOutOfBoundsException
- * - if the value offset is negative or not less than the length of
- * the char array.
- */
- public abstract int codePointAt(final char[] chars, final int offset, final int limit);
+public final class CharacterUtils {
- /** Return the number of characters in <code>seq</code>. */
- public abstract int codePointCount(CharSequence seq);
+ private CharacterUtils() {} // no instantiation
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
@@ -114,13 +51,13 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
- public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
+ public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
- codePointAt(buffer, i, limit)), buffer, i);
+ Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
@@ -131,25 +68,25 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
- public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+ public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toUpperCase(
- codePointAt(buffer, i, limit)), buffer, i);
+ Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
- public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+ public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
- final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
+ final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
@@ -159,7 +96,7 @@ public abstract class CharacterUtils {
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
- public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+ public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
@@ -202,17 +139,45 @@ public abstract class CharacterUtils {
* @throws IOException
* if the reader throws an {@link IOException}.
*/
- public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
+ public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
+ assert buffer.buffer.length >= 2;
+ if (numChars < 2 || numChars > buffer.buffer.length) {
+ throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+ }
+ final char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ final int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0) {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ buffer.lastTrailingHighSurrogate = 0;
+ offset = 1;
+ } else {
+ offset = 0;
+ }
+
+ final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+ buffer.length = offset + read;
+ final boolean result = buffer.length == numChars;
+ if (buffer.length < numChars) {
+ // We failed to fill the buffer. Even if the last char is a high
+ // surrogate, there is nothing we can do
+ return result;
+ }
+
+ if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return result;
+ }
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
- public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+ public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
- /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
- * code points from <code>index</code>. */
- public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
-
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
@@ -225,112 +190,6 @@ public abstract class CharacterUtils {
return read;
}
- private static final class Java5CharacterUtils extends CharacterUtils {
- Java5CharacterUtils() {
- }
-
- @Override
- public int codePointAt(final CharSequence seq, final int offset) {
- return Character.codePointAt(seq, offset);
- }
-
- @Override
- public int codePointAt(final char[] chars, final int offset, final int limit) {
- return Character.codePointAt(chars, offset, limit);
- }
-
- @Override
- public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
- assert buffer.buffer.length >= 2;
- if (numChars < 2 || numChars > buffer.buffer.length) {
- throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
- }
- final char[] charBuffer = buffer.buffer;
- buffer.offset = 0;
- final int offset;
-
- // Install the previously saved ending high surrogate:
- if (buffer.lastTrailingHighSurrogate != 0) {
- charBuffer[0] = buffer.lastTrailingHighSurrogate;
- buffer.lastTrailingHighSurrogate = 0;
- offset = 1;
- } else {
- offset = 0;
- }
-
- final int read = readFully(reader, charBuffer, offset, numChars - offset);
-
- buffer.length = offset + read;
- final boolean result = buffer.length == numChars;
- if (buffer.length < numChars) {
- // We failed to fill the buffer. Even if the last char is a high
- // surrogate, there is nothing we can do
- return result;
- }
-
- if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
- buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
- }
- return result;
- }
-
- @Override
- public int codePointCount(CharSequence seq) {
- return Character.codePointCount(seq, 0, seq.length());
- }
-
- @Override
- public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
- return Character.offsetByCodePoints(buf, start, count, index, offset);
- }
- }
-
- private static final class Java4CharacterUtils extends CharacterUtils {
- Java4CharacterUtils() {
- }
-
- @Override
- public int codePointAt(final CharSequence seq, final int offset) {
- return seq.charAt(offset);
- }
-
- @Override
- public int codePointAt(final char[] chars, final int offset, final int limit) {
- if(offset >= limit)
- throw new IndexOutOfBoundsException("offset must be less than limit");
- return chars[offset];
- }
-
- @Override
- public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
- throws IOException {
- assert buffer.buffer.length >= 1;
- if (numChars < 1 || numChars > buffer.buffer.length) {
- throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
- }
- buffer.offset = 0;
- final int read = readFully(reader, buffer.buffer, 0, numChars);
- buffer.length = read;
- buffer.lastTrailingHighSurrogate = 0;
- return read == numChars;
- }
-
- @Override
- public int codePointCount(CharSequence seq) {
- return seq.length();
- }
-
- @Override
- public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
- final int result = index + offset;
- if (result < 0 || result > count) {
- throw new IndexOutOfBoundsException();
- }
- return result;
- }
-
- }
-
/**
* A simple IO buffer to use with
* {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
index d8b2fca..ef4856c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
@@ -85,8 +85,6 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
int numTerms = atLeast(50);
boolean ignoreCase = random().nextBoolean();
- CharacterUtils charUtils = CharacterUtils.getInstance();
-
for (int i = 0; i < numTerms; i++) {
String randomRealisticUnicodeString = TestUtil
.randomRealisticUnicodeString(random());
@@ -107,7 +105,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
if (ignoreCase) {
// TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
char[] buffer = inputValue.toCharArray();
- charUtils.toLowerCase(buffer, 0, buffer.length);
+ CharacterUtils.toLowerCase(buffer, 0, buffer.length);
seenInputValue = buffer.toString();
} else {
seenInputValue = inputValue;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
index 2faeec7..04e96ea 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
@@ -32,102 +32,15 @@ import org.junit.Test;
*/
public class TestCharacterUtils extends LuceneTestCase {
- @Test
- public void testCodePointAtCharSequenceInt() {
- CharacterUtils java4 = CharacterUtils.getJava4Instance();
- String cpAt3 = "Abc\ud801\udc1c";
- String highSurrogateAt3 = "Abc\ud801";
- assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
- assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
- assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
- expectThrows(IndexOutOfBoundsException.class, () -> {
- java4.codePointAt(highSurrogateAt3, 4);
- });
-
- CharacterUtils java5 = CharacterUtils.getInstance();
- assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
- assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
- cpAt3, 3));
- assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
- expectThrows(IndexOutOfBoundsException.class, () -> {
- java5.codePointAt(highSurrogateAt3, 4);
- });
- }
-
- @Test
- public void testCodePointAtCharArrayIntInt() {
- CharacterUtils java4 = CharacterUtils.getJava4Instance();
- char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
- char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
- assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
- assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5));
- assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4));
-
- CharacterUtils java5 = CharacterUtils.getInstance();
- assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2));
- assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
- cpAt3, 3, 5));
- assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
- }
-
- @Test
- public void testCodePointCount() {
- CharacterUtils java4 = CharacterUtils.getJava4Instance();
- CharacterUtils java5 = CharacterUtils.getInstance();
- final String s = TestUtil.randomUnicodeString(random());
- assertEquals(s.length(), java4.codePointCount(s));
- assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
- }
-
- @Test
- public void testOffsetByCodePoint() {
- CharacterUtils java4 = CharacterUtils.getJava4Instance();
- CharacterUtils java5 = CharacterUtils.getInstance();
- for (int i = 0; i < 10; ++i) {
- final char[] s = TestUtil.randomUnicodeString(random()).toCharArray();
- final int index = TestUtil.nextInt(random(), 0, s.length);
- final int offset = random().nextInt(7) - 3;
- try {
- final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
- assertEquals(o, index + offset);
- } catch (IndexOutOfBoundsException e) {
- assertTrue((index + offset) < 0 || (index + offset) > s.length);
- }
-
- int o;
- try {
- o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
- } catch (IndexOutOfBoundsException e) {
- try {
- Character.offsetByCodePoints(s, 0, s.length, index, offset);
- fail();
- } catch (IndexOutOfBoundsException e2) {
- // OK
- }
- o = -1;
- }
- if (o >= 0) {
- assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
- }
- }
- }
-
public void testConversions() {
- CharacterUtils java4 = CharacterUtils.getJava4Instance();
- CharacterUtils java5 = CharacterUtils.getInstance();
- testConversions(java4);
- testConversions(java5);
- }
-
- private void testConversions(CharacterUtils charUtils) {
final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
final int[] buf = new int[orig.length];
final char[] restored = new char[buf.length];
final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
final int o2 = TestUtil.nextInt(random(), 0, o1);
final int o3 = TestUtil.nextInt(random(), 0, o1);
- final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
- final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
+ final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
+ final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
assertEquals(orig.length - o1, charCount);
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
}
@@ -152,71 +65,43 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testFillNoHighSurrogate() throws IOException {
- CharacterUtils versions[] = new CharacterUtils[] {
- CharacterUtils.getInstance(),
- CharacterUtils.getJava4Instance() };
- for (CharacterUtils instance : versions) {
- Reader reader = new StringReader("helloworld");
- CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
- assertTrue(instance.fill(buffer,reader));
- assertEquals(0, buffer.getOffset());
- assertEquals(6, buffer.getLength());
- assertEquals("hellow", new String(buffer.getBuffer()));
- assertFalse(instance.fill(buffer,reader));
- assertEquals(4, buffer.getLength());
- assertEquals(0, buffer.getOffset());
+ Reader reader = new StringReader("helloworld");
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
+ assertTrue(CharacterUtils.fill(buffer,reader));
+ assertEquals(0, buffer.getOffset());
+ assertEquals(6, buffer.getLength());
+ assertEquals("hellow", new String(buffer.getBuffer()));
+ assertFalse(CharacterUtils.fill(buffer,reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals(0, buffer.getOffset());
- assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
- buffer.getLength()));
- assertFalse(instance.fill(buffer,reader));
- }
+ assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertFalse(CharacterUtils.fill(buffer,reader));
}
@Test
- public void testFillJava15() throws IOException {
+ public void testFill() throws IOException {
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
- CharacterUtils instance = CharacterUtils.getInstance();
Reader reader = new StringReader(input);
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
- assertTrue(instance.fill(buffer, reader));
+ assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(4, buffer.getLength());
assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
buffer.getLength()));
- assertTrue(instance.fill(buffer, reader));
+ assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(5, buffer.getLength());
assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
- assertTrue(instance.fill(buffer, reader));
+ assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(4, buffer.getLength());
assertEquals("123\ud801", new String(buffer.getBuffer(),
buffer.getOffset(), buffer.getLength()));
- assertFalse(instance.fill(buffer, reader));
+ assertFalse(CharacterUtils.fill(buffer, reader));
assertEquals(3, buffer.getLength());
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
- assertFalse(instance.fill(buffer, reader));
+ assertFalse(CharacterUtils.fill(buffer, reader));
assertEquals(0, buffer.getLength());
}
- @Test
- public void testFillJava14() throws IOException {
- String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
- CharacterUtils instance = CharacterUtils.getJava4Instance();
- Reader reader = new StringReader(input);
- CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
- assertTrue(instance.fill(buffer, reader));
- assertEquals(5, buffer.getLength());
- assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer
- .getOffset(), buffer.getLength()));
- assertTrue(instance.fill(buffer, reader));
- assertEquals(5, buffer.getLength());
- assertEquals("\udc1c7891", new String(buffer.getBuffer()));
- buffer = CharacterUtils.newCharacterBuffer(6);
- assertTrue(instance.fill(buffer, reader));
- assertEquals(6, buffer.getLength());
- assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
- .getOffset(), buffer.getLength()));
- assertFalse(instance.fill(buffer, reader));
-
- }
-
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/061f6880/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
index ff36dbe..3429d86 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.CharsRefBuilder;
/**
@@ -54,7 +53,6 @@ public class MorfologikFilter extends TokenFilter {
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharsRefBuilder scratch = new CharsRefBuilder();
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
private State current;
private final TokenStream input;
@@ -154,7 +152,7 @@ public class MorfologikFilter extends TokenFilter {
char buffer[] = scratch.chars();
for (int i = 0; i < length;) {
i += Character.toChars(
- Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
+ Character.toLowerCase(Character.codePointAt(chs, i)), buffer, i);
}
return scratch.get();