You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2019/06/21 01:26:34 UTC
[lucene-solr] branch branch_8x updated: LUCENE-8866: remove kuromoji/tools dependency on ICU

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 2adc8c6  LUCENE-8866: remove kuromoji/tools dependency on ICU
2adc8c6 is described below

commit 2adc8c6c13d1a74c3a371c2341a05507e893dabf
Author: Robert Muir <rm...@apache.org>
AuthorDate: Thu Jun 20 21:20:17 2019 -0400

    LUCENE-8866: remove kuromoji/tools dependency on ICU
---
 lucene/analysis/kuromoji/build.xml                        | 14 +-------------
 .../analysis/ja/util/TokenInfoDictionaryBuilder.java      | 15 ++++++---------
 2 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index 094e2bd..2d531f8 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -69,13 +69,8 @@
            originalfile="${dict.src.dir}/Noun.proper.csv"/>
   </target>
 
-  <path id="tools.dependencies">
-    <fileset dir="../icu/lib"/>
-  </path>
-
   <path id="tools.classpath">
     <path refid="classpath"/>
-    <path refid="tools.dependencies"/>
     <pathelement location="${build.dir}/classes/java"/>
     <pathelement location="${build.dir}/classes/tools"/>
   </path>
@@ -108,14 +103,7 @@
     </sequential>
   </target>
 
-   <!-- we don't actually need to compile this thing, we just want its lib -->
-   <target name="resolve-icu">
-     <ant dir="../icu/" target="resolve" inheritAll="false">
-       <propertyset refid="uptodate.and.compiled.properties"/>
-     </ant>
-   </target>
-
-  <target name="compile-tools" depends="resolve-icu, compile-core, common.compile-tools">
+  <target name="compile-tools" depends="compile-core, common.compile-tools">
     <compile
       srcdir="src/tools/java"
       destdir="${build.dir}/classes/tools">
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 465a432..dc2eac3 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -26,6 +26,7 @@ import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -38,8 +39,6 @@ import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
-import com.ibm.icu.text.Normalizer2;
-
 /**
  */
 public class TokenInfoDictionaryBuilder {
@@ -49,16 +48,14 @@ public class TokenInfoDictionaryBuilder {
   
   private String encoding = "euc-jp";
   
-  private boolean normalizeEntries = false;
-  private Normalizer2 normalizer;
+  private Normalizer.Form normalForm;
   
   private DictionaryFormat format = DictionaryFormat.IPADIC;
   
   public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
     this.format = format;
     this.encoding = encoding;
-    this.normalizeEntries = normalizeEntries;
-    this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
+    this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
   }
   
   public TokenInfoDictionaryWriter build(String dirname) throws IOException {
@@ -103,13 +100,13 @@ public class TokenInfoDictionaryBuilder {
         lines.add(formatted);
         
         // NFKC normalize dictionary entry
-        if (normalizeEntries) {
-          if (normalizer.isNormalized(entry[0])){
+        if (normalForm != null) {
+          if (Normalizer.isNormalized(entry[0], normalForm)){
             continue;
           }
           String[] normalizedEntry = new String[entry.length];
           for (int i = 0; i < entry.length; i++) {
-            normalizedEntry[i] = normalizer.normalize(entry[i]);
+            normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
           }
           
           formatted = formatEntry(normalizedEntry);