You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2021/03/25 23:32:50 UTC
[lucene] branch main updated: LUCENE-9853: Use CJKWidthCharFilter
as the default character width normalizer in JapaneseAnalyzer (#26)
This is an automated email from the ASF dual-hosted git repository.
tomoko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new ea74ffb LUCENE-9853: Use CJKWidthCharFilter as the default character width normalizer in JapaneseAnalyzer (#26)
ea74ffb is described below
commit ea74ffb98401116330081497efecc65cb0482396
Author: Tomoko Uchida <to...@gmail.com>
AuthorDate: Fri Mar 26 08:32:42 2021 +0900
LUCENE-9853: Use CJKWidthCharFilter as the default character width normalizer in JapaneseAnalyzer (#26)
---
lucene/CHANGES.txt | 3 +++
.../org/apache/lucene/analysis/ja/JapaneseAnalyzer.java | 17 +++++++++++++----
.../apache/lucene/analysis/ja/TestJapaneseAnalyzer.java | 16 ++++++++++++++++
3 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e3a146e..101fc07 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -95,6 +95,9 @@ API Changes
* LUCENE-9796: SortedDocValues no longer extends BinaryDocValues, as binaryValue() was not performant.
See MIGRATE.md for details. (Robert Muir)
+* LUCENE-9853: JapaneseAnalyzer should use CJKWidthCharFilter for full-width and half-width character normalization.
+ (Tomoko Uchida)
+
Improvements
* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index 4752dc1..e6290b9 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -17,6 +17,7 @@
package org.apache.lucene.analysis.ja;
import java.io.IOException;
+import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
@@ -25,7 +26,7 @@ import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
@@ -95,7 +96,6 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, true, mode);
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
stream = new JapanesePartOfSpeechStopFilter(stream, stoptags);
- stream = new CJKWidthFilter(stream);
stream = new StopFilter(stream, stopwords);
stream = new JapaneseKatakanaStemFilter(stream);
stream = new LowerCaseFilter(stream);
@@ -104,8 +104,17 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
- TokenStream result = new CJKWidthFilter(in);
- result = new LowerCaseFilter(result);
+ TokenStream result = new LowerCaseFilter(in);
return result;
}
+
+ @Override
+ protected Reader initReader(String fieldName, Reader reader) {
+ return new CJKWidthCharFilter(reader);
+ }
+
+ @Override
+ protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+ return new CJKWidthCharFilter(reader);
+ }
}
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
index 4c92312..e5536fc 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
@@ -146,6 +146,22 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
a.close();
}
+ public void testCharWidthNormalization() throws Exception {
+ final Analyzer a =
+ new JapaneseAnalyzer(
+ TestJapaneseTokenizer.readDict(),
+ Mode.SEARCH,
+ JapaneseAnalyzer.getDefaultStopSet(),
+ JapaneseAnalyzer.getDefaultStopTags());
+ assertTokenStreamContents(
+ a.tokenStream("foo", "新橋６－２０－１"),
+ new String[] {"新橋", "6", "20", "1"},
+ new int[] {0, 2, 4, 7},
+ new int[] {2, 3, 6, 8},
+ 8);
+ a.close();
+ }
+
// LUCENE-3897: this string (found by running all jawiki
// XML through JapaneseAnalyzer) caused AIOOBE
public void testCuriousString() throws Exception {