You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/20 01:19:01 UTC
svn commit: r1579488 - in /lucene/dev/trunk/lucene: CHANGES.txt
analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
Author: rmuir
Date: Thu Mar 20 00:19:00 2014
New Revision: 1579488
URL: http://svn.apache.org/r1579488
Log:
LUCENE-4072: add ICUNormalizer2CharFilter
Added:
lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java (with props)
lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1579488&r1=1579487&r2=1579488&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Mar 20 00:19:00 2014
@@ -112,6 +112,9 @@ New Features
you update the value of a BinaryDocValuesField without reindexing the
document(s). (Shai Erera)
+* LUCENE-4072: Add ICUNormalizer2CharFilter, which lets you do unicode normalization
+ with offset correction before the tokenizer. (David Goldfarb, Ippei UKAI via Robert Muir)
+
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
Added: lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java?rev=1579488&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java Thu Mar 20 00:19:00 2014
@@ -0,0 +1,206 @@
+package org.apache.lucene.analysis.icu;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+
+import com.ibm.icu.text.Normalizer2;
+
+/**
+ * Normalizes the input character stream with ICU's {@link Normalizer2}
+ * before it reaches the tokenizer, recording an offset correction whenever
+ * normalization changes the length of the text.
+ */
+public final class ICUNormalizer2CharFilter extends BaseCharFilter {
+
+  /** Size, in chars, of the scratch buffer used for each read from the wrapped reader. */
+  private static final int IO_BUFFER_SIZE = 128;
+
+  private final Normalizer2 normalizer;
+  /** Raw input read from the wrapped reader that has not been normalized yet. */
+  private final StringBuilder inputBuffer = new StringBuilder();
+  /** Normalized text that has not been handed to the caller yet. */
+  private final StringBuilder resultBuffer = new StringBuilder();
+  /** Scratch buffer for reads from the wrapped reader. */
+  private final char[] tmpBuffer = new char[IO_BUFFER_SIZE];
+
+  /** Set once the wrapped reader has reported end of stream. */
+  private boolean inputFinished;
+  /** True once the quick-check pass has been tried on the current head of {@link #inputBuffer}. */
+  private boolean afterQuickCheckYes;
+  /** Index into {@link #inputBuffer} up to which normalization boundaries have been searched. */
+  private int checkedInputBoundary;
+  /** Number of normalized chars produced so far; the output offset used for corrections. */
+  private int charCount;
+
+  /**
+   * Create a new Normalizer2CharFilter that combines NFKC normalization, Case
+   * Folding, and removes Default Ignorables (NFKC_Casefold)
+   */
+  public ICUNormalizer2CharFilter(Reader in) {
+    this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+  }
+
+  /**
+   * Create a new Normalizer2CharFilter with the specified Normalizer2
+   * @param in text
+   * @param normalizer normalizer to use
+   * @throws NullPointerException if {@code normalizer} is null
+   */
+  public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
+    super(in);
+    if (normalizer == null) {
+      throw new NullPointerException("normalizer == null");
+    }
+    this.normalizer = normalizer;
+  }
+
+  /**
+   * Reads normalized characters into {@code cbuf}.
+   *
+   * @param cbuf destination buffer
+   * @param off index in {@code cbuf} at which to start storing characters
+   * @param len maximum number of characters to store; must be positive
+   * @return the number of characters stored, or -1 once the input is exhausted
+   *         and every buffered character has been returned
+   * @throws IllegalArgumentException if {@code off} is negative or not less than
+   *         {@code cbuf.length}, or if {@code len} is not positive
+   * @throws IOException if reading from the wrapped reader fails
+   */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (off < 0) throw new IllegalArgumentException("off < 0");
+    if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
+    if (len <= 0) throw new IllegalArgumentException("len <= 0");
+
+    while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
+      int retLen;
+
+      // First drain whatever has already been normalized.
+      if (resultBuffer.length() > 0) {
+        retLen = outputFromResultBuffer(cbuf, off, len);
+        if (retLen > 0) {
+          return retLen;
+        }
+      }
+
+      // Then try to normalize more of the buffered input.
+      int resLen = readAndNormalizeFromInput();
+      if (resLen > 0) {
+        retLen = outputFromResultBuffer(cbuf, off, len);
+        if (retLen > 0) {
+          return retLen;
+        }
+      }
+
+      // Nothing could be produced: pull more raw input and try again.
+      readInputToBuffer();
+    }
+
+    return -1;
+  }
+
+  /**
+   * Appends raw input to {@link #inputBuffer}, reading chunk after chunk until
+   * a chunk ends on a char that is safe to stop at, or the input ends.
+   *
+   * <p>This is a loop rather than a recursive call so that a long run of
+   * chunks ending on non-inert chars cannot exhaust the stack.
+   *
+   * @return the total number of chars appended by this call
+   * @throws IOException if reading from the wrapped reader fails
+   */
+  private int readInputToBuffer() throws IOException {
+    int total = 0;
+    while (true) {
+      final int len = input.read(tmpBuffer);
+      if (len == -1) {
+        inputFinished = true;
+        return total;
+      }
+      if (len == 0) {
+        // Nothing was read, so there is no last char to inspect; let the caller retry.
+        return total;
+      }
+      inputBuffer.append(tmpBuffer, 0, len);
+      total += len;
+
+      // if checkedInputBoundary was at the end of a buffer, we need to check that char again
+      checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
+
+      // isInert is asked about a single UTF-16 code unit here, so a surrogate says
+      // nothing about the supplementary code point it belongs to. Treat every
+      // surrogate (high or low) as non-inert and keep reading.
+      final char last = tmpBuffer[len - 1];
+      if (normalizer.isInert(last) && !Character.isSurrogate(last)) {
+        return total;
+      }
+    }
+  }
+
+  /**
+   * Moves text from {@link #inputBuffer} to {@link #resultBuffer}: first the
+   * prefix that passes the normalizer's quick check, otherwise a normalized
+   * span that ends on a boundary.
+   *
+   * @return the number of chars appended to {@link #resultBuffer}; 0 if nothing
+   *         could be produced from the input buffered so far
+   */
+  private int readAndNormalizeFromInput() {
+    if (inputBuffer.length() <= 0) {
+      afterQuickCheckYes = false;
+      return 0;
+    }
+    if (!afterQuickCheckYes) {
+      int resLen = readFromInputWhileSpanQuickCheckYes();
+      afterQuickCheckYes = true;
+      if (resLen > 0) return resLen;
+    }
+    int resLen = readFromIoNormalizeUptoBoundary();
+    if (resLen > 0) {
+      // The head of the buffer changed, so the quick check is worth trying again.
+      afterQuickCheckYes = false;
+    }
+    return resLen;
+  }
+
+  /**
+   * Copies the leading span reported by {@link Normalizer2#spanQuickCheckYes}
+   * to {@link #resultBuffer} unchanged and removes it from {@link #inputBuffer}.
+   *
+   * @return the length of the span that was moved, possibly 0
+   */
+  private int readFromInputWhileSpanQuickCheckYes() {
+    int end = normalizer.spanQuickCheckYes(inputBuffer);
+    if (end > 0) {
+      // Ranged append avoids materializing a temporary subsequence.
+      resultBuffer.append(inputBuffer, 0, end);
+      inputBuffer.delete(0, end);
+      checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
+      // Input and output lengths match, so no offset correction is needed.
+      charCount += end;
+    }
+    return end;
+  }
+
+  /**
+   * Advances {@link #checkedInputBoundary} code point by code point until the
+   * normalizer reports a boundary before the next code point, then normalizes
+   * everything before that boundary. When the input is finished, the end of the
+   * buffer counts as a boundary.
+   *
+   * @return the number of normalized chars produced, or 0 if no boundary was found
+   */
+  private int readFromIoNormalizeUptoBoundary() {
+    // if there's no buffer to normalize, return 0
+    if (inputBuffer.length() <= 0) {
+      return 0;
+    }
+
+    boolean foundBoundary = false;
+    final int bufLen = inputBuffer.length();
+
+    while (checkedInputBoundary <= bufLen - 1) {
+      int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
+      checkedInputBoundary += charLen;
+      if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
+          .codePointAt(checkedInputBoundary))) {
+        foundBoundary = true;
+        break;
+      }
+    }
+    if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
+      // No more input will arrive, so the remainder can be normalized as it is.
+      foundBoundary = true;
+      checkedInputBoundary = bufLen;
+    }
+
+    if (!foundBoundary) {
+      return 0;
+    }
+
+    return normalizeInputUpto(checkedInputBoundary);
+  }
+
+  /**
+   * Normalizes the first {@code length} chars of {@link #inputBuffer}, appends
+   * the result to {@link #resultBuffer}, and records the offset difference.
+   *
+   * @param length number of leading input chars to consume
+   * @return the number of chars the result buffer grew by
+   */
+  private int normalizeInputUpto(final int length) {
+    final int destOrigLen = resultBuffer.length();
+    normalizer.normalizeSecondAndAppend(resultBuffer,
+        inputBuffer.subSequence(0, length));
+    inputBuffer.delete(0, length);
+    checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
+    final int resultLength = resultBuffer.length() - destOrigLen;
+    recordOffsetDiff(length, resultLength);
+    return resultLength;
+  }
+
+  /**
+   * Advances {@link #charCount} by {@code outputLength} and, when the lengths
+   * differ, registers offset corrections through {@code addOffCorrectMap}.
+   *
+   * <p>If the output is longer than the input, one correction is registered
+   * for each extra output char. If it is shorter, a single correction carrying
+   * the whole difference is registered.
+   *
+   * @param inputLength number of input chars consumed
+   * @param outputLength number of normalized chars produced from them
+   */
+  private void recordOffsetDiff(int inputLength, int outputLength) {
+    if (inputLength == outputLength) {
+      charCount += outputLength;
+      return;
+    }
+    final int diff = inputLength - outputLength;
+    final int cumuDiff = getLastCumulativeDiff();
+    if (diff < 0) {
+      for (int i = 1; i <= -diff; ++i) {
+        addOffCorrectMap(charCount + i, cumuDiff - i);
+      }
+    } else {
+      addOffCorrectMap(charCount + Math.min(1, outputLength), cumuDiff + diff);
+    }
+    charCount += outputLength;
+  }
+
+  /**
+   * Moves up to {@code len} chars from the front of {@link #resultBuffer}
+   * into {@code cbuf}, starting at index {@code begin}.
+   *
+   * @return the number of chars copied, possibly 0
+   */
+  private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
+    len = Math.min(resultBuffer.length(), len);
+    resultBuffer.getChars(0, len, cbuf, begin);
+    if (len > 0) {
+      resultBuffer.delete(0, len);
+    }
+    return len;
+  }
+}
\ No newline at end of file
Added: lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java?rev=1579488&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java (added)
+++ lucene/dev/trunk/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java Thu Mar 20 00:19:00 2014
@@ -0,0 +1,225 @@
+package org.apache.lucene.analysis.icu;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.util.TestUtil;
+
+import com.ibm.icu.text.Normalizer2;
+
+public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
+
+ public void testNormalization() throws IOException {
+ String input = "Ê°ã°ã5ââã±ãï¼ãããã¡ã¼ã®æ£è¦åã®ãã¹ãï¼ãããããカキクケコサï¾ï½¼ï¾ï½½ï¾ï½¾ï¾ï½¿ï¾gÌê°/ê°à®¨à®¿à¹à¸à¤·à¤¿chkÊ·à¤à¥à¤·à¤¿";
+ Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+ String expectedOutput = normalizer.normalize(input);
+
+ CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
+ char[] tempBuff = new char[10];
+ StringBuilder output = new StringBuilder();
+ while (true) {
+ int length = reader.read(tempBuff);
+ if (length == -1) {
+ break;
+ }
+ output.append(tempBuff, 0, length);
+ assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
+ }
+
+ assertEquals(expectedOutput, output.toString());
+ }
+
+ public void testTokenStream() throws IOException {
+ // 'â', 'â', 'ã±', 'ã', 'ï½»'+'<<', 'ソ'+'<<', 'ã°'+'<<'
+ String input = "â â ã± ã ï½»ï¾ ï½¿ï¾ ã°ï¾";
+
+ CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+ Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
+
+ Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ tokenStream.setReader(reader);
+
+ assertTokenStreamContents(tokenStream,
+ new String[] {"°C", "No", "(æ ª)", "ã°ã©ã ", "ã¶", "ã¾", "ãã´"},
+ new int[] {0, 2, 4, 6, 8, 11, 14},
+ new int[] {1, 3, 5, 7, 10, 13, 16},
+ input.length());
+ }
+
+ public void testTokenStream2() throws IOException {
+ // 'ã°', '<<'ã, '5', 'â', 'â', 'ã±', 'ã', 'ï½»', '<<', 'ソ', '<<'
+ String input = "ã°ã5ââã±ãï½»ï¾ï½¿ï¾";
+
+ CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+ Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+
+ Tokenizer tokenStream = new NGramTokenizer(TEST_VERSION_CURRENT, 1, 1);
+ tokenStream.setReader(reader);
+
+ assertTokenStreamContents(tokenStream,
+ new String[] {"ã", "ã´", "5", "°", "c", "n", "o", "(", "æ ª", ")", "ã°", "ã©", "ã ", "ã¶", "ã¾"},
+ new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
+ new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
+ input.length()
+ );
+ }
+
+ public void testMassiveLigature() throws IOException {
+ String input = "\uFDFA";
+
+ CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
+ Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+
+ Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ tokenStream.setReader(reader);
+
+ assertTokenStreamContents(tokenStream,
+ new String[] {"صÙÙ", "اÙÙÙ", "عÙÙÙ", "ÙسÙÙ
"},
+ new int[]{0, 0, 0, 0},
+ new int[]{0, 0, 0, 1},
+ input.length()
+ );
+ }
+
+ /**
+  * Feeds random Unicode strings through a keyword-tokenizing analyzer whose
+  * reader is wrapped in {@link ICUNormalizer2CharFilter}, and passes each
+  * input together with its directly normalized form to {@code checkOneTerm}.
+  *
+  * @param normalizer normalizer used both by the filter and for the expected value
+  * @param maxLength length bound handed to {@code TestUtil.randomUnicodeString}
+  * @param iterations number of random strings to try
+  */
+ public void doTestMode(final Normalizer2 normalizer, int maxLength, int iterations) throws IOException {
+   final Analyzer analyzer = new Analyzer() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName) {
+       return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
+     }
+
+     @Override
+     protected Reader initReader(String fieldName, Reader reader) {
+       return new ICUNormalizer2CharFilter(reader, normalizer);
+     }
+   };
+
+   for (int round = 0; round < iterations; round++) {
+     final String input = TestUtil.randomUnicodeString(random(), maxLength);
+     if (input.length() == 0) {
+       continue;
+     }
+     final String expected = normalizer.normalize(input);
+     // MockTokenizer doesn't tokenize the empty string, so only non-empty results are checked.
+     if (expected.length() != 0) {
+       checkOneTerm(analyzer, input, expected);
+     }
+   }
+ }
+
+ /** Random short strings through NFC ("nfc" data, COMPOSE mode). */
+ public void testNFC() throws Exception {
+   final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfc, 20, RANDOM_MULTIPLIER*1000);
+ }
+
+ /** Random strings of up to 8192 units through NFC ("nfc" data, COMPOSE mode). */
+ public void testNFCHuge() throws Exception {
+   final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfc, 8192, RANDOM_MULTIPLIER*100);
+ }
+
+ /** Random short strings through NFD; NFD is the "nfc" data used in DECOMPOSE mode. */
+ public void testNFD() throws Exception {
+   final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
+   doTestMode(nfd, 20, RANDOM_MULTIPLIER*1000);
+ }
+
+ /** Random strings of up to 8192 units through NFD ("nfc" data, DECOMPOSE mode). */
+ public void testNFDHuge() throws Exception {
+   final Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
+   doTestMode(nfd, 8192, RANDOM_MULTIPLIER*100);
+ }
+
+ /** Random short strings through NFKC ("nfkc" data, COMPOSE mode). */
+ public void testNFKC() throws Exception {
+   final Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfkc, 20, RANDOM_MULTIPLIER*1000);
+ }
+
+ /** Random strings of up to 8192 units through NFKC ("nfkc" data, COMPOSE mode). */
+ public void testNFKCHuge() throws Exception {
+   final Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfkc, 8192, RANDOM_MULTIPLIER*100);
+ }
+
+ /** Random short strings through NFKD; NFKD is the "nfkc" data used in DECOMPOSE mode. */
+ public void testNFKD() throws Exception {
+   final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
+   doTestMode(nfkd, 20, RANDOM_MULTIPLIER*1000);
+ }
+
+ /** Random strings of up to 8192 units through NFKD ("nfkc" data, DECOMPOSE mode). */
+ public void testNFKDHuge() throws Exception {
+   final Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
+   doTestMode(nfkd, 8192, RANDOM_MULTIPLIER*100);
+ }
+
+ /** Random short strings through NFKC_Casefold ("nfkc_cf" data, COMPOSE mode). */
+ public void testNFKC_CF() throws Exception {
+   final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfkcCasefold, 20, RANDOM_MULTIPLIER*1000);
+ }
+
+ /** Random strings of up to 8192 units through NFKC_Casefold ("nfkc_cf" data, COMPOSE mode). */
+ public void testNFKC_CFHuge() throws Exception {
+   final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
+   doTestMode(nfkcCasefold, 8192, RANDOM_MULTIPLIER*100);
+ }
+
+ /**
+  * Runs {@code checkRandomData} against two whitespace-tokenizing analyzers
+  * whose readers are wrapped in {@link ICUNormalizer2CharFilter}: one using
+  * nfkc_cf, one using NFKD. Each is run at the default size and again with
+  * a maximum length of 8192.
+  */
+ public void testRandomStrings() throws IOException {
+   // nfkc_cf
+   final Analyzer nfkcCasefoldAnalyzer = new Analyzer() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName) {
+       return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+     }
+
+     @Override
+     protected Reader initReader(String fieldName, Reader reader) {
+       return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+     }
+   };
+   checkRandomData(random(), nfkcCasefoldAnalyzer, 1000*RANDOM_MULTIPLIER);
+   // huge strings
+   checkRandomData(random(), nfkcCasefoldAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
+
+   // nfkd
+   final Analyzer nfkdAnalyzer = new Analyzer() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName) {
+       return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+     }
+
+     @Override
+     protected Reader initReader(String fieldName, Reader reader) {
+       return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE));
+     }
+   };
+   checkRandomData(random(), nfkdAnalyzer, 1000*RANDOM_MULTIPLIER);
+   // huge strings
+   checkRandomData(random(), nfkdAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
+ }
+
+ /**
+  * Runs {@code checkAnalysisConsistency} 1000 times on one fixed string
+  * (surrogate pairs, private-use chars, Arabic letters and super/subscripts)
+  * using an nfkc_cf char filter in front of a whitespace tokenizer.
+  */
+ public void testCuriousString() throws Exception {
+   final String curious = "\udb40\udc3d\uf273\ue960\u06c8\ud955\udc13\ub7fc\u0692 \u2089\u207b\u2073\u2075";
+   final Analyzer analyzer = new Analyzer() {
+     @Override
+     protected TokenStreamComponents createComponents(String fieldName) {
+       return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+     }
+
+     @Override
+     protected Reader initReader(String fieldName, Reader reader) {
+       return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+     }
+   };
+   int remaining = 1000;
+   while (remaining-- > 0) {
+     checkAnalysisConsistency(random(), analyzer, false, curious);
+   }
+ }
+}