You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2020/11/17 08:32:27 UTC
[lucene-solr] branch master updated: LUCENE-9413: Add
CJKWidthCharFilter and its factory. (#2081)
This is an automated email from the ASF dual-hosted git repository.
tomoko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 8503efd LUCENE-9413: Add CJKWidthCharFilter and its factory. (#2081)
8503efd is described below
commit 8503efdcff91461114a26f6aaae180a90570da2b
Author: Tomoko Uchida <to...@gmail.com>
AuthorDate: Tue Nov 17 17:32:10 2020 +0900
LUCENE-9413: Add CJKWidthCharFilter and its factory. (#2081)
---
lucene/CHANGES.txt | 2 +
.../lucene/analysis/cjk/CJKWidthCharFilter.java | 148 +++++++++++++++++++++
.../analysis/cjk/CJKWidthCharFilterFactory.java | 57 ++++++++
.../org.apache.lucene.analysis.CharFilterFactory | 1 +
.../analysis/cjk/TestCJKWidthCharFilter.java | 118 ++++++++++++++++
.../cjk/TestCJKWidthCharFilterFactory.java | 43 ++++++
6 files changed, 369 insertions(+)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0fc90b4..c114bc9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -230,6 +230,8 @@ New Features
* LUCENE-9378: Doc values now allow configuring how to trade compression for
retrieval speed. (Adrien Grand)
+* LUCENE-9413: Add CJKWidthCharFilter and its factory (Tomoko Uchida)
+
Improvements
---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java
new file mode 100644
index 0000000..4fb7ced
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilter.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.charfilter.BaseCharFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A {@link org.apache.lucene.analysis.CharFilter} that normalizes CJK width differences:
+ * <ul>
+ *   <li>Folds fullwidth ASCII variants into the equivalent basic latin
+ *   <li>Folds halfwidth Katakana variants into the equivalent kana
+ * </ul>
+ * <p>
+ * A halfwidth (semi-)voiced sound mark that directly follows a kana it can combine with is
+ * merged with that kana into a single composed character; the offset correction map is
+ * updated so that offsets still point into the original input.
+ * <p>
+ * NOTE: this char filter is the exact counterpart of {@link CJKWidthFilter}.
+ */
+public class CJKWidthCharFilter extends BaseCharFilter {
+
+  /*
+   * Halfwidth kana mappings: 0xFF65-0xFF9F (indexed by ch - 0xFF65).
+   *
+   * note: 0xFF9E and 0xFF9F (the halfwidth voiced / semi-voiced sound marks) are only
+   * mapped to 0x3099 and 0x309A as a fallback when they cannot properly combine with a
+   * preceding character into a composed form.
+   */
+  private static final char[] KANA_NORM = new char[] {
+    0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
+    0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
+    0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
+    0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
+    0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
+    0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
+    0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
+  };
+
+  /*
+   * Kana combining diffs: 0x30A6-0x30FD (indexed by ch - 0x30A6).
+   * Each entry is the amount added to the base kana to obtain its voiced form;
+   * 0 means the kana does not combine with the voiced sound mark.
+   */
+  private static final byte[] KANA_COMBINE_VOICED = new byte[] {
+    78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+    0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
+    0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+  };
+
+  /* Same layout as KANA_COMBINE_VOICED, but for the semi-voiced sound mark. */
+  private static final byte[] KANA_COMBINE_SEMI_VOICED = new byte[] {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
+    0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  private static final int HW_KATAKANA_VOICED_MARK = 0xFF9E;
+  private static final int HW_KATAKANA_SEMI_VOICED_MARK = 0xFF9F;
+
+  /** Already-normalized char that was read but not yet returned, or -1 if there is none. */
+  private int prevChar = -1;
+
+  /** Number of chars consumed from the wrapped input so far. */
+  private int inputOff = 0;
+
+  /** Default constructor that takes a {@link Reader}. */
+  public CJKWidthCharFilter(Reader in) {
+    super(in);
+  }
+
+  /**
+   * Returns the next width-normalized char, or -1 once the input is exhausted.
+   * <p>
+   * The filter always holds back one normalized char in {@code prevChar}, because a voice
+   * mark that follows it may still have to be merged into it.
+   */
+  @Override
+  public int read() throws IOException {
+    while (true) {
+      final int ch = input.read();
+      if (ch == -1) {
+        // reached end of the input: flush the held-back char (or report EOF if none is left)
+        final int ret = prevChar;
+        prevChar = -1;
+        return ret;
+      }
+
+      inputOff++;
+      int ret = -1;
+      // if the current char is a voice mark, then try to combine it with the previous char.
+      if (ch == HW_KATAKANA_SEMI_VOICED_MARK || ch == HW_KATAKANA_VOICED_MARK) {
+        final int combinedChar = combineVoiceMark(prevChar, ch);
+        if (prevChar != combinedChar) {
+          // successfully combined. returns the combined char immediately
+          prevChar = -1;
+          // two input chars became one output char, so the cumulative offset
+          // difference grows by one from this output position on
+          final int prevCumulativeDiff = getLastCumulativeDiff();
+          addOffCorrectMap(inputOff - 1 - prevCumulativeDiff, prevCumulativeDiff + 1);
+          return combinedChar;
+        }
+      }
+
+      // the held-back char can no longer be combined; it is what this call returns (if any)
+      if (prevChar != -1) {
+        ret = prevChar;
+      }
+
+      if (ch >= 0xFF01 && ch <= 0xFF5E) {
+        // Fullwidth ASCII variants
+        prevChar = ch - 0xFEE0;
+      } else if (ch >= 0xFF65 && ch <= 0xFF9F) {
+        // Halfwidth Katakana variants
+        prevChar = KANA_NORM[ch - 0xFF65];
+      } else {
+        // no need to normalize
+        prevChar = ch;
+      }
+
+      if (ret != -1) {
+        return ret;
+      }
+      // nothing was held back yet (first char, or right after a combined char): keep reading
+    }
+  }
+
+  /**
+   * Tries to merge a halfwidth voice mark into the given (already normalized) char.
+   *
+   * @param ch the char preceding the voice mark; may be -1 when there is none
+   * @param voiceMark {@code HW_KATAKANA_VOICED_MARK} or {@code HW_KATAKANA_SEMI_VOICED_MARK}
+   * @return the combined char if the voice mark could be combined, otherwise {@code ch} unchanged
+   */
+  private static int combineVoiceMark(int ch, int voiceMark) {
+    assert voiceMark == HW_KATAKANA_SEMI_VOICED_MARK || voiceMark == HW_KATAKANA_VOICED_MARK;
+    if (ch >= 0x30A6 && ch <= 0x30FD) {
+      final byte[] diffs = (voiceMark == HW_KATAKANA_SEMI_VOICED_MARK)
+          ? KANA_COMBINE_SEMI_VOICED
+          : KANA_COMBINE_VOICED;
+      // index with the parameter itself rather than the prevChar field
+      ch += diffs[ch - 0x30A6];
+    }
+    return ch;
+  }
+
+  /**
+   * Fills {@code cbuf} by repeatedly calling {@link #read()}.
+   *
+   * @return the number of chars stored, 0 if {@code len} is zero, or -1 if the end of the
+   *     input was reached before any char could be stored
+   */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (len == 0) {
+      // Reader contract: a zero-length request reads nothing and is not an end-of-stream signal
+      return 0;
+    }
+    int numRead = 0;
+    for (int i = off; i < off + len; i++) {
+      final int c = read();
+      if (c == -1) {
+        break;
+      }
+      cbuf[i] = (char) c;
+      numRead++;
+    }
+    return numRead == 0 ? -1 : numRead;
+  }
+
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
new file mode 100644
index 0000000..4f8bf09
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthCharFilterFactory.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.cjk;
+
+
+import org.apache.lucene.analysis.CharFilterFactory;
+
+import java.io.Reader;
+import java.util.Map;
+
+/**
+ * Factory that creates {@link CJKWidthCharFilter} instances. It accepts no parameters.
+ * @lucene.spi {@value #NAME}
+ */
+public class CJKWidthCharFilterFactory extends CharFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "cjkWidth";
+
+  /** Default ctor for compatibility with SPI */
+  public CJKWidthCharFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  /**
+   * Creates a new CJKWidthCharFilterFactory.
+   *
+   * @param args factory arguments; anything still present after the superclass
+   *     constructor has run is rejected
+   * @throws IllegalArgumentException if {@code args} is not empty at that point
+   */
+  public CJKWidthCharFilterFactory(Map<String,String> args) {
+    super(args);
+    final boolean hasLeftoverArgs = !args.isEmpty();
+    if (hasLeftoverArgs) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Normalization applies the same char filter as {@link #create(Reader)}. */
+  @Override
+  public Reader normalize(Reader input) {
+    return create(input);
+  }
+
+  /** Wraps {@code input} in a new {@link CJKWidthCharFilter}. */
+  @Override
+  public Reader create(Reader input) {
+    final Reader filtered = new CJKWidthCharFilter(input);
+    return filtered;
+  }
+
+}
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
index b53db41..c9f43ec 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.CharFilterFactory
@@ -15,5 +15,6 @@
org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
+org.apache.lucene.analysis.cjk.CJKWidthCharFilterFactory
org.apache.lucene.analysis.fa.PersianCharFilterFactory
org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java
new file mode 100644
index 0000000..92f9851
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilter.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Tests for {@link CJKWidthCharFilter}.
+ * <p>
+ * All fullwidth / halfwidth inputs are written as Unicode escapes on purpose: tools that
+ * apply width (compatibility) normalization to this file would otherwise silently turn the
+ * inputs into their already-normalized forms, making the tests vacuous or breaking the
+ * asserted offsets, which refer to positions in the un-normalized input.
+ */
+public class TestCJKWidthCharFilter extends BaseTokenStreamTestCase {
+  /**
+   * Full-width ASCII forms normalized to half-width (basic latin)
+   */
+  public void testFullWidthASCII() throws IOException {
+    // fullwidth "Test", ASCII space, fullwidth "1234"
+    CharFilter reader = new CJKWidthCharFilter(
+        new StringReader("\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14"));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
+  }
+
+  /**
+   * Half-width katakana forms normalized to standard katakana.
+   * A bit trickier in some cases, since half-width forms are decomposed
+   * and voice marks need to be recombined with a preceding base form.
+   */
+  public void testHalfWidthKana() throws IOException {
+    // halfwidth KA TA KA NA (4 chars, no voice marks)
+    CharFilter reader = new CJKWidthCharFilter(new StringReader("\uFF76\uFF80\uFF76\uFF85"));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"カタカナ"}, new int[]{0}, new int[]{4}, 4);
+
+    // halfwidth U + voiced mark, small I, small TU, TU (5 input chars -> 4 output chars)
+    reader = new CJKWidthCharFilter(new StringReader("\uFF73\uFF9E\uFF68\uFF6F\uFF82"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ヴィッツ"}, new int[]{0}, new int[]{5}, 5);
+
+    // halfwidth HA + semi-voiced mark, NA, SO, NI, small TU, KU (7 input chars -> 6 output chars)
+    reader = new CJKWidthCharFilter(
+        new StringReader("\uFF8A\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"パナソニック"}, new int[]{0}, new int[]{7}, 7);
+
+    // both words above, separated by an ASCII space
+    reader = new CJKWidthCharFilter(new StringReader(
+        "\uFF73\uFF9E\uFF68\uFF6F\uFF82 \uFF8A\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ヴィッツ", "パナソニック"}, new int[]{0, 6}, new int[]{5, 13}, 13);
+  }
+
+  /**
+   * Input may contain orphan voiced marks that cannot be combined with the previous character.
+   */
+  public void testOrphanVoiceMark() throws Exception {
+    // halfwidth A cannot take a voiced mark, so the mark falls back to U+3099
+    CharFilter reader = new CJKWidthCharFilter(new StringReader("\uFF71\uFF9E\uFF68\uFF6F\uFF82"));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ア\u3099ィッツ"}, new int[]{0}, new int[]{5}, 5);
+
+    // voiced mark at the very start of the input
+    reader = new CJKWidthCharFilter(new StringReader("\uFF9E\uFF68\uFF6F\uFF82"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"\u3099ィッツ"}, new int[]{0}, new int[]{4}, 4);
+
+    // halfwidth A cannot take a semi-voiced mark, so the mark falls back to U+309A
+    reader = new CJKWidthCharFilter(
+        new StringReader("\uFF71\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"ア\u309Aナソニック"}, new int[]{0}, new int[]{7}, 7);
+
+    // semi-voiced mark at the very start of the input
+    reader = new CJKWidthCharFilter(new StringReader("\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"\u309Aナソニック"}, new int[]{0}, new int[]{6}, 6);
+  }
+
+  public void testComplexInput() throws Exception {
+    // fullwidth "Test", ASCII space, fullwidth "1234"
+    CharFilter reader = new CJKWidthCharFilter(
+        new StringReader("\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14"));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"Test", "1234"}, new int[]{0, 5}, new int[]{4, 9}, 9);
+
+    // three halfwidth words (4, 5 and 7 input chars) separated by ASCII spaces
+    reader = new CJKWidthCharFilter(new StringReader(
+        "\uFF76\uFF80\uFF76\uFF85 \uFF73\uFF9E\uFF68\uFF6F\uFF82 \uFF8A\uFF9F\uFF85\uFF7F\uFF86\uFF6F\uFF78"));
+    ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{"カタカナ", "ヴィッツ", "パナソニック"}, new int[]{0, 5, 11}, new int[]{4, 10, 18}, 18);
+  }
+
+  public void testEmptyInput() throws Exception {
+    CharFilter reader = new CJKWidthCharFilter(new StringReader(""));
+    TokenStream ts = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(ts, new String[]{});
+  }
+
+  public void testRandom() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new CJKWidthCharFilter(reader);
+      }
+    };
+    int numRounds = RANDOM_MULTIPLIER * 1000;
+    checkRandomData(random(), analyzer, numRounds);
+    analyzer.close();
+  }
+
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java
new file mode 100644
index 0000000..7770434
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthCharFilterFactory.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cjk;
+
+import org.apache.lucene.analysis.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.io.StringReader;
+
+/**
+ * Simple tests to ensure {@link CJKWidthCharFilterFactory} is working
+ */
+public class TestCJKWidthCharFilterFactory extends BaseTokenStreamFactoryTestCase {
+  /** The filter created through the SPI name folds fullwidth ASCII to basic latin. */
+  public void test() throws Exception {
+    // fullwidth "Test", ASCII space, fullwidth "1234"; written as escapes so that
+    // width-normalizing tools cannot turn the input into plain ASCII and make the test vacuous
+    Reader reader = charFilterFactory("cjkWidth").create(
+        new StringReader("\uFF34\uFF45\uFF53\uFF54 \uFF11\uFF12\uFF13\uFF14"));
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    assertTokenStreamContents(stream, new String[] { "Test", "1234" });
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      charFilterFactory("cjkWidth", "bogusArg", "bogusValue");
+    });
+    assertTrue(expected.getMessage().contains("Unknown parameters"));
+  }
+}