You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by so...@apache.org on 2019/06/27 02:34:07 UTC
[lucene-solr] branch master updated: LUCENE-8871: promote kuromoji tools to main jar

This is an automated email from the ASF dual-hosted git repository.

sokolov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/master by this push:
     new 024e200  LUCENE-8871: promote kuromoji tools to main jar
024e200 is described below

commit 024e200bb908c8c0b52af98c5d51e23c8faf074c
Author: Michael Sokolov <so...@falutin.net>
AuthorDate: Sat Jun 22 11:13:02 2019 -0400

    LUCENE-8871: promote kuromoji tools to main jar
---
 lucene/analysis/kuromoji/build.xml                 |  51 +-------
 .../analysis/ja/dict/TokenInfoDictionary.java      |  14 +--
 .../analysis/ja/util/BinaryDictionaryWriter.java   |  70 +++++------
 .../ja/util/CharacterDefinitionWriter.java         |  23 ++--
 .../analysis/ja/util/ConnectionCostsBuilder.java   |  62 ++++++++++
 .../analysis/ja/util/ConnectionCostsWriter.java    |  28 ++---
 .../lucene/analysis/ja/util/DictionaryBuilder.java |  52 ++++++++
 .../ja/util/TokenInfoDictionaryBuilder.java        | 127 +++++++------------
 .../ja/util/TokenInfoDictionaryWriter.java         |  16 +--
 .../analysis/ja/util/UnknownDictionaryBuilder.java | 119 ++++++++++++++++++
 .../analysis/ja/util/UnknownDictionaryWriter.java  |   4 +-
 .../analysis/ja/dict/TokenInfoDictionary$fst.dat   | Bin 1698570 -> 1698570 bytes
 ...ictionary.java => TokenInfoDictionaryTest.java} |  91 +++++++++++---
 .../analysis/ja/dict/UnknownDictionaryTest.java    |   0
 .../analysis/ja/util/ConnectionCostsBuilder.java   |  68 -----------
 .../lucene/analysis/ja/util/DictionaryBuilder.java |  85 -------------
 .../analysis/ja/util/UnknownDictionaryBuilder.java | 135 ---------------------
 .../analysis/ja/dict/TokenInfoDictionaryTest.java  |  85 -------------
 18 files changed, 424 insertions(+), 606 deletions(-)

diff --git a/lucene/analysis/kuromoji/build.xml b/lucene/analysis/kuromoji/build.xml
index 2d531f8..7afa31d 100644
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -26,7 +26,6 @@
   <!-- currently whether rat detects this as binary or not
        is platform dependent?! -->
   <property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
-  <property name="rat.additional-includes" value="src/tools/**"/>
 
   <!-- we don't want to pull in ipadic/naist etc -->
   <property name="ivy.default.configuration" value="default"/>
@@ -52,6 +51,9 @@
   <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
 
   <path id="classpath">
+    <dirset dir="${build.dir}">
+      <include name="classes/java"/>
+    </dirset>
     <pathelement path="${analyzers-common.jar}"/>
     <path refid="base.classpath"/>
   </path>
@@ -69,28 +71,14 @@
            originalfile="${dict.src.dir}/Noun.proper.csv"/>
   </target>
 
-  <path id="tools.classpath">
-    <path refid="classpath"/>
-    <pathelement location="${build.dir}/classes/java"/>
-    <pathelement location="${build.dir}/classes/tools"/>
-  </path>
-
-  <path id="tools.test.classpath">
-    <path refid="tools.classpath"/>
-    <path refid="test.base.classpath"/>
-    <pathelement location="${build.dir}/classes/tools-test"/>
-  </path>
-
-  <target name="build-dict" depends="compile-tools, patch-dict">
+  <target name="build-dict" depends="compile, patch-dict">
     <sequential>
       <delete verbose="true">
         <fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
       </delete>
       <!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
       <java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ja.util.DictionaryBuilder">
-        <classpath>
-          <path refid="tools.classpath"/>
-        </classpath>
+        <classpath refid="classpath"/>
         <assertions>
           <enable package="org.apache.lucene"/>
         </assertions>
@@ -103,34 +91,7 @@
     </sequential>
   </target>
 
-  <target name="compile-tools" depends="compile-core, common.compile-tools">
-    <compile
-      srcdir="src/tools/java"
-      destdir="${build.dir}/classes/tools">
-      <classpath>
-        <path refid="tools.classpath"/>
-      </classpath>
-    </compile>
-  </target>
-
-  <target name="compile-tools-tests" depends="compile-tools">
-    <compile
-      srcdir="src/tools/test"
-      destdir="${build.dir}/classes/tools-test">
-      <classpath>
-        <path refid="tools.test.classpath"/>
-        <pathelement path="src/tools/test"/>
-      </classpath>
-     </compile>
-  </target>
-
-  <target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
-    <test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
-  </target>
-
-  <target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
-  <!-- TODO: not until we properly make 'test-tools' work with clover etc
-  <target name="test" depends="module-build.test, test-tools"/> -->
+  <target name="compile-test" depends="module-build.compile-test"/>
 
   <target name="regenerate" depends="build-dict"/>
 
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
index 662ebb5..9a201a9 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.IOException;
 
 import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 
@@ -43,20 +42,9 @@ public final class TokenInfoDictionary extends BinaryDictionary {
    */
   TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
     super(resourceScheme, resourcePath);
-    InputStream is = null;
     FST<Long> fst;
-    boolean success = false;
-    try {
-      is = getResource(FST_FILENAME_SUFFIX);
-      is = new BufferedInputStream(is);
+    try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
       fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
-      success = true;
-    } finally {
-      if (success) {
-        IOUtils.close(is);
-      } else {
-        IOUtils.closeWhileHandlingException(is);
-      }
     }
     // TODO: some way to configure?
     this.fst = new TokenInfoFST(fst, true);
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
similarity index 83%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
index a6ef6bb..1aec333 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
@@ -18,13 +18,13 @@ package org.apache.lucene.analysis.ja.util;
 
 
 import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 
 import org.apache.lucene.codecs.CodecUtil;
@@ -37,14 +37,14 @@ import org.apache.lucene.analysis.ja.dict.BinaryDictionary;
 public abstract class BinaryDictionaryWriter {
   private final static int ID_LIMIT = 8192;
 
-  protected final Class<? extends BinaryDictionary> implClazz;
+  private final Class<? extends BinaryDictionary> implClazz;
   protected ByteBuffer buffer;
   private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
   private int[] targetMap = new int[8192];
   private int[] targetMapOffsets = new int[8192];
   private final ArrayList<String> posDict = new ArrayList<>();
 
-  public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
+  BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
     this.implClazz = implClazz;
     buffer = ByteBuffer.allocate(size);
   }
@@ -199,7 +199,7 @@ public abstract class BinaryDictionaryWriter {
   }
   
   private String toKatakana(String s) {
-    char text[] = new char[s.length()];
+    char[] text = new char[s.length()];
     for (int i = 0; i < s.length(); i++) {
       char ch = s.charAt(i);
       if (ch > 0x3040 && ch < 0x3097) {
@@ -211,7 +211,7 @@ public abstract class BinaryDictionaryWriter {
     return new String(text);
   }
   
-  public static int sharedPrefix(String left, String right) {
+  private static int sharedPrefix(String left, String right) {
     int len = left.length() < right.length() ? left.length() : right.length();
     for (int i = 0; i < len; i++)
       if (left.charAt(i) != right.charAt(i))
@@ -219,7 +219,7 @@ public abstract class BinaryDictionaryWriter {
     return len;
   }
   
-  public void addMapping(int sourceId, int wordId) {
+  void addMapping(int sourceId, int wordId) {
     if (wordId <= lastWordId) {
       throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
     }
@@ -241,8 +241,8 @@ public abstract class BinaryDictionaryWriter {
     lastWordId = wordId;
   }
 
-  protected final String getBaseFileName(String baseDir) {
-    return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
+  final String getBaseFileName() {
+    return implClazz.getName().replace('.', '/');
   }
   
   /**
@@ -251,20 +251,19 @@ public abstract class BinaryDictionaryWriter {
    * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
    * @throws IOException if an I/O error occurs writing the dictionary files
    */
-  public void write(String baseDir) throws IOException {
-    final String baseName = getBaseFileName(baseDir);
-    writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
-    writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
-    writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+  public void write(Path baseDir) throws IOException {
+    final String baseName = getBaseFileName();
+    writeDictionary(baseDir.resolve(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX));
+    writeTargetMap(baseDir.resolve(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX));
+    writePosDict(baseDir.resolve(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX));
   }
   
   // TODO: maybe this int[] should instead be the output to the FST...
-  protected void writeTargetMap(String filename) throws IOException {
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
+  private void writeTargetMap(Path path) throws IOException {
+    Files.createDirectories(path.getParent());
+    try (OutputStream os = Files.newOutputStream(path);
+         OutputStream bos = new BufferedOutputStream(os)) {
+      final DataOutput out = new OutputStreamDataOutput(bos);
       CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
       
       final int numSourceIds = lastSourceId + 1;
@@ -285,17 +284,14 @@ public abstract class BinaryDictionaryWriter {
       if (sourceId != numSourceIds) {
         throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
       }
-    } finally {
-      os.close();
     }
   }
   
-  protected void writePosDict(String filename) throws IOException {
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
+  private void writePosDict(Path path) throws IOException {
+    Files.createDirectories(path.getParent());
+    try (OutputStream os = Files.newOutputStream(path);
+         OutputStream bos = new BufferedOutputStream(os)) {
+      final DataOutput out = new OutputStreamDataOutput(bos);
       CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
       out.writeVInt(posDict.size());
       for (String s : posDict) {
@@ -304,7 +300,7 @@ public abstract class BinaryDictionaryWriter {
           out.writeByte((byte)0);
           out.writeByte((byte)0);
         } else {
-          String data[] = CSVUtil.parse(s);
+          String[] data = CSVUtil.parse(s);
           if (data.length != 3) {
             throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 3 characters");
           }
@@ -313,25 +309,21 @@ public abstract class BinaryDictionaryWriter {
           out.writeString(data[2]);
         }
       }
-    } finally {
-      os.close();
     }
   }
   
-  protected void writeDictionary(String filename) throws IOException {
-    new File(filename).getParentFile().mkdirs();
-    final FileOutputStream os = new FileOutputStream(filename);
-    try {
-      final DataOutput out = new OutputStreamDataOutput(os);
+  private void writeDictionary(Path path) throws IOException {
+    Files.createDirectories(path.getParent());
+    try (OutputStream os = Files.newOutputStream(path);
+         OutputStream bos = new BufferedOutputStream(os)) {
+      final DataOutput out = new OutputStreamDataOutput(bos);
       CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
       out.writeVInt(buffer.position());
-      final WritableByteChannel channel = Channels.newChannel(os);
+      final WritableByteChannel channel = Channels.newChannel(bos);
       // Write Buffer
       buffer.flip();  // set position to 0, set limit to current position
       channel.write(buffer);
       assert buffer.remaining() == 0L;
-    } finally {
-      os.close();
     }
   }
 }
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java
similarity index 80%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java
index deed5af..60edabe 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/CharacterDefinitionWriter.java
@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.ja.util;
 
 
 import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Arrays;
 
 import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
@@ -40,7 +40,7 @@ public final class CharacterDefinitionWriter {
   /**
    * Constructor for building. TODO: remove write access
    */
-  public CharacterDefinitionWriter() {
+  CharacterDefinitionWriter() {
     Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
   }
   
@@ -51,7 +51,7 @@ public final class CharacterDefinitionWriter {
    *            code point
    * @param characterClassName character class name
    */
-  public void putCharacterCategory(int codePoint, String characterClassName) {
+  void putCharacterCategory(int codePoint, String characterClassName) {
     characterClassName = characterClassName.split(" ")[0]; // use first
     // category
     // class
@@ -63,20 +63,17 @@ public final class CharacterDefinitionWriter {
     characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
   }
   
-  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+  void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
     final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
     invokeMap[characterClass] = invoke == 1;
     groupMap[characterClass] = group == 1;
     // TODO: length def ignored
   }
   
-  public void write(String baseDir) throws IOException {
-    String filename = baseDir + File.separator +
-      CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
+  public void write(Path baseDir) throws IOException {
+    Path path = baseDir.resolve(CharacterDefinition.class.getName().replace('.', '/') + CharacterDefinition.FILENAME_SUFFIX);
+    Files.createDirectories(path.getParent());
+    try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))){
       final DataOutput out = new OutputStreamDataOutput(os);
       CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
       out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
@@ -87,8 +84,6 @@ public final class CharacterDefinitionWriter {
         );
         out.writeByte(b);
       }
-    } finally {
-      os.close();
     }
   }
   
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java
new file mode 100644
index 0000000..d1bc846
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja.util;
+
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+public class ConnectionCostsBuilder {
+
+  private ConnectionCostsBuilder() {
+  }
+
+  public static ConnectionCostsWriter build(Path path) throws IOException {
+    try (Reader reader = Files.newBufferedReader(path, StandardCharsets.US_ASCII);
+         LineNumberReader lineReader = new LineNumberReader(reader)) {
+
+      String line = lineReader.readLine();
+      String[] dimensions = line.split("\\s+");
+
+      assert dimensions.length == 2;
+
+      int forwardSize = Integer.parseInt(dimensions[0]);
+      int backwardSize = Integer.parseInt(dimensions[1]);
+
+      assert forwardSize > 0 && backwardSize > 0;
+
+      ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
+
+      while ((line = lineReader.readLine()) != null) {
+        String[] fields = line.split("\\s+");
+
+        assert fields.length == 3;
+
+        int forwardId = Integer.parseInt(fields[0]);
+        int backwardId = Integer.parseInt(fields[1]);
+        int cost = Integer.parseInt(fields[2]);
+
+        costs.add(forwardId, backwardId, cost);
+      }
+      return costs;
+    }
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
similarity index 75%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
index 6ad8a68..a629fff 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.ja.util;
 
 
 import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
 import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
 
@@ -37,7 +37,7 @@ public final class ConnectionCostsWriter {
   /**
    * Constructor for building. TODO: remove write access
    */
-  public ConnectionCostsWriter(int forwardSize, int backwardSize) {
+  ConnectionCostsWriter(int forwardSize, int backwardSize) {
     this.forwardSize = forwardSize;
     this.backwardSize = backwardSize;
     this.costs = new short[backwardSize][forwardSize];
@@ -47,14 +47,12 @@ public final class ConnectionCostsWriter {
     this.costs[backwardId][forwardId] = (short)cost;
   }
   
-  public void write(String baseDir) throws IOException {
-    String filename = baseDir + File.separator +
-      ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
+  public void write(Path baseDir) throws IOException {
+    Files.createDirectories(baseDir);
+    String fileName = ConnectionCosts.class.getName().replace('.', '/') + ConnectionCosts.FILENAME_SUFFIX;
+    try (OutputStream os = Files.newOutputStream(baseDir.resolve(fileName));
+         OutputStream bos = new BufferedOutputStream(os)) {
+      final DataOutput out = new OutputStreamDataOutput(bos);
       CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
       out.writeVInt(forwardSize);
       out.writeVInt(backwardSize);
@@ -62,14 +60,12 @@ public final class ConnectionCostsWriter {
       assert costs.length == backwardSize;
       for (short[] a : costs) {
         assert a.length == forwardSize;
-        for (int i = 0; i < a.length; i++) {
-          int delta = (int)a[i] - last;
+        for (short cost : a) {
+          int delta = (int) cost - last;
           out.writeZInt(delta);
-          last = a[i];
+          last = cost;
         }
       }
-    } finally {
-      os.close();
     }
   }
   
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
new file mode 100644
index 0000000..bdf8368
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja.util;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+public class DictionaryBuilder {
+
+  public enum DictionaryFormat { IPADIC, UNIDIC }
+
+  private DictionaryBuilder() {
+  }
+
+  public static void build(DictionaryFormat format, Path inputDir, Path outputDir, String encoding, boolean normalizeEntry) throws IOException {
+    new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry)
+        .build(inputDir)
+        .write(outputDir);
+
+    new UnknownDictionaryBuilder(encoding)
+        .build(inputDir)
+        .write(outputDir);
+
+    ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
+        .write(outputDir);
+  }
+
+  public static void main(String[] args) throws IOException {
+    DictionaryFormat format = DictionaryFormat.valueOf(args[0].toUpperCase());
+    String inputDirName = args[1];
+    String outputDirName = args[2];
+    String inputEncoding = args[3];
+    boolean normalizeEntries = Boolean.parseBoolean(args[4]);
+    DictionaryBuilder.build(format, Paths.get(inputDirName), Paths.get(outputDirName), inputEncoding, normalizeEntries);
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
similarity index 58%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index dc2eac3..bbed37b 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -16,22 +16,18 @@
  */
 package org.apache.lucene.analysis.ja.util;
 
-
 import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FilenameFilter;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.util.IntsRefBuilder;
@@ -42,89 +38,63 @@ import org.apache.lucene.util.fst.PositiveIntOutputs;
 /**
  */
 public class TokenInfoDictionaryBuilder {
-  
+
+  private final String encoding;
+  private final Normalizer.Form normalForm;
+  private final DictionaryFormat format;
+
   /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
   private int offset = 0;
-  
-  private String encoding = "euc-jp";
-  
-  private Normalizer.Form normalForm;
-  
-  private DictionaryFormat format = DictionaryFormat.IPADIC;
-  
+
   public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
     this.format = format;
     this.encoding = encoding;
-    this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
+    normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
   }
   
-  public TokenInfoDictionaryWriter build(String dirname) throws IOException {
-    FilenameFilter filter = new FilenameFilter() {
-      @Override
-      public boolean accept(File dir, String name) {
-        return name.endsWith(".csv");
-      }
-    };
-    ArrayList<File> csvFiles = new ArrayList<>();
-    for (File file : new File(dirname).listFiles(filter)) {
-      csvFiles.add(file);
+  public TokenInfoDictionaryWriter build(Path dir) throws IOException {
+    try (Stream<Path> files = Files.list(dir)) {
+      List<Path> csvFiles = files
+          .filter(path -> path.getFileName().toString().endsWith(".csv"))
+          .sorted()
+          .collect(Collectors.toList());
+      return buildDictionary(csvFiles);
     }
-    Collections.sort(csvFiles);
-    return buildDictionary(csvFiles);
   }
 
-  public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
+  private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
     TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
-    
+    Charset cs = Charset.forName(encoding);
     // all lines in the file
-    System.out.println("  parse...");
     List<String[]> lines = new ArrayList<>(400000);
-    for (File file : csvFiles){
-      FileInputStream inputStream = new FileInputStream(file);
-      Charset cs = Charset.forName(encoding);
-      CharsetDecoder decoder = cs.newDecoder()
-          .onMalformedInput(CodingErrorAction.REPORT)
-          .onUnmappableCharacter(CodingErrorAction.REPORT);
-      InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
-      BufferedReader reader = new BufferedReader(streamReader);
-      
-      String line = null;
-      while ((line = reader.readLine()) != null) {
-        String[] entry = CSVUtil.parse(line);
+    for (Path path : csvFiles) {
+      try (BufferedReader reader = Files.newBufferedReader(path, cs)) {
+        String line;
+        while ((line = reader.readLine()) != null) {
+          String[] entry = CSVUtil.parse(line);
 
-        if(entry.length < 13) {
-          throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
-        }
-        
-        String[] formatted = formatEntry(entry);
-        lines.add(formatted);
-        
-        // NFKC normalize dictionary entry
-        if (normalForm != null) {
-          if (Normalizer.isNormalized(entry[0], normalForm)){
-            continue;
+          if (entry.length < 13) {
+            throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
           }
-          String[] normalizedEntry = new String[entry.length];
-          for (int i = 0; i < entry.length; i++) {
-            normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
+
+          lines.add(formatEntry(entry));
+
+          if (normalForm != null) {
+            if (Normalizer.isNormalized(entry[0], normalForm)) {
+              continue;
+            }
+            String[] normalizedEntry = new String[entry.length];
+            for (int i = 0; i < entry.length; i++) {
+              normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
+            }
+            lines.add(formatEntry(normalizedEntry));
           }
-          
-          formatted = formatEntry(normalizedEntry);
-          lines.add(formatted);
         }
       }
     }
     
-    System.out.println("  sort...");
-
     // sort by term: we sorted the files already and use a stable sort.
-    Collections.sort(lines, new Comparator<String[]>() {
-      public int compare(String[] left, String[] right) {
-        return left[0].compareTo(right[0]);
-      }
-    });
-    
-    System.out.println("  encode...");
+    lines.sort(Comparator.comparing(entry -> entry[0]));
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
     Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15, false);
@@ -132,13 +102,12 @@ public class TokenInfoDictionaryBuilder {
     long ord = -1; // first ord will be 0
     String lastValue = null;
 
-    // build tokeninfo dictionary
+    // build token info dictionary
     for (String[] entry : lines) {
       int next = dictionary.put(entry);
         
       if(next == offset){
-        System.out.println("Failed to process line: " + Arrays.toString(entry));
-        continue;
+        throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
       }
       
       String token = entry[0];
@@ -153,16 +122,10 @@ public class TokenInfoDictionaryBuilder {
         }
         fstBuilder.add(scratch.get(), ord);
       }
-      dictionary.addMapping((int)ord, offset);
+      dictionary.addMapping((int) ord, offset);
       offset = next;
     }
-    
-    final FST<Long> fst = fstBuilder.finish();
-    
-    System.out.print("  " + fstBuilder.getNodeCount() + " nodes, " + fstBuilder.getArcCount() + " arcs, " + fst.ramBytesUsed() + " bytes...  ");
-    dictionary.setFST(fst);
-    System.out.println(" done");
-    
+    dictionary.setFST(fstBuilder.finish());
     return dictionary;
   }
   
@@ -191,7 +154,7 @@ public class TokenInfoDictionaryBuilder {
    * 13  - surface reading
    */
   
-  public String[] formatEntry(String[] features) {
+  private String[] formatEntry(String[] features) {
     if (this.format == DictionaryFormat.IPADIC) {
       return features;
     } else {
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java
similarity index 76%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java
index 6c2a28c..81cad4f 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryWriter.java
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.ja.util;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.util.Objects;
 
 import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
 import org.apache.lucene.util.fst.FST;
@@ -28,23 +28,23 @@ import org.apache.lucene.util.fst.FST;
 public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
   private FST<Long> fst;
 
-  public TokenInfoDictionaryWriter(int size) {
+  TokenInfoDictionaryWriter(int size) {
     super(TokenInfoDictionary.class, size);
   }
   
   public void setFST(FST<Long> fst) {
+    Objects.requireNonNull(fst, "dictionary must not be empty");
     this.fst = fst;
   }
   
   @Override
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
     super.write(baseDir);
-    writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
+    writeFST(baseDir.resolve(getBaseFileName() + TokenInfoDictionary.FST_FILENAME_SUFFIX));
   }
   
-  protected void writeFST(String filename) throws IOException {
-    Path p = Paths.get(filename);
-    Files.createDirectories(p.getParent());
-    fst.save(p);
+  private void writeFST(Path path) throws IOException {
+    Files.createDirectories(path.getParent());
+    fst.save(path);
   }  
 }
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
new file mode 100644
index 0000000..c3abd45
--- /dev/null
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ja.util;
+
+
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
+
+public class UnknownDictionaryBuilder {
+  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
+
+  private final String encoding;
+
+  UnknownDictionaryBuilder(String encoding) {
+    this.encoding = encoding;
+  }
+
+  public UnknownDictionaryWriter build(Path dir) throws IOException {
+    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def"));  //Should be only one file
+    readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
+    return unkDictionary;
+  }
+
+  private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
+    return readDictionaryFile(path, encoding);
+  }
+
+  private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException {
+    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
+
+    List<String[]> lines = new ArrayList<>();
+    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
+         LineNumberReader lineReader = new LineNumberReader(reader)) {
+
+      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
+
+      String line;
+      while ((line = lineReader.readLine()) != null) {
+        // Note: unk.def only has 10 fields. Appending empty reading and pronunciation fields here keeps the writer
+        // simple, even though the unknown dictionary returns hardcoded null for those fields.
+        final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
+        lines.add(parsed);
+      }
+    }
+
+    lines.sort(Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
+
+    for (String[] entry : lines) {
+      dictionary.put(entry);
+    }
+
+    return dictionary;
+  }
+
+  private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException {
+    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
+         LineNumberReader lineReader = new LineNumberReader(reader)) {
+
+      String line;
+      while ((line = lineReader.readLine()) != null) {
+        line = line.replaceAll("^\\s", "");
+        line = line.replaceAll("\\s*#.*", "");
+        line = line.replaceAll("\\s+", " ");
+
+        // Skip empty line or comment line
+        if (line.length() == 0) {
+          continue;
+        }
+
+        if (line.startsWith("0x")) {  // Category mapping
+          String[] values = line.split(" ", 2);  // Split only first space
+
+          if (!values[0].contains("..")) {
+            int cp = Integer.decode(values[0]);
+            dictionary.putCharacterCategory(cp, values[1]);
+          } else {
+            String[] codePoints = values[0].split("\\.\\.");
+            int cpFrom = Integer.decode(codePoints[0]);
+            int cpTo = Integer.decode(codePoints[1]);
+
+            for (int i = cpFrom; i <= cpTo; i++) {
+              dictionary.putCharacterCategory(i, values[1]);
+            }
+          }
+        } else {  // Invoke definition
+          String[] values = line.split(" "); // Consecutive space is merged above
+          String characterClassName = values[0];
+          int invoke = Integer.parseInt(values[1]);
+          int group = Integer.parseInt(values[2]);
+          int length = Integer.parseInt(values[3]);
+          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+        }
+      }
+    }
+  }
+}
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java
similarity index 95%
rename from lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java
rename to lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java
index a5819f9..6809825 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryWriter.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.ja.util;
 
 
 import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 
 import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
 import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
@@ -58,7 +60,7 @@ public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
   }
   
   @Override
-  public void write(String baseDir) throws IOException {
+  public void write(Path baseDir) throws IOException {
     super.write(baseDir);
     characterDefinition.write(baseDir);
   }
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index c06fd4a..9328c53 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
similarity index 57%
rename from lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java
rename to lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
index eab4ec3..69328d8 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestTokenInfoDictionary.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
@@ -16,16 +16,76 @@
  */
 package org.apache.lucene.analysis.ja.dict;
 
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
 
+import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.analysis.ja.util.ToStringUtil;
+import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
+import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
 import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntsRefFSTEnum;
-import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
 
-public class TestTokenInfoDictionary extends LuceneTestCase {
+import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
+
+/**
+ * Tests of TokenInfoDictionary and its build tools; this class lives in the regular test sources (src/test).
+ */
+public class TokenInfoDictionaryTest extends LuceneTestCase {
+
+  public void testPut() throws Exception {
+    TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
+                                               // "large" id
+                                               "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
+    IntsRef wordIdRef = new IntsRefBuilder().get();
+
+    dict.lookupWordIds(0, wordIdRef);
+    int wordId = wordIdRef.ints[wordIdRef.offset];
+    assertEquals(5000, dict.getLeftId(wordId));
+    assertEquals(5000, dict.getRightId(wordId));
+    assertEquals(3, dict.getWordCost(wordId));
+
+    dict.lookupWordIds(1, wordIdRef);
+    wordId = wordIdRef.ints[wordIdRef.offset];
+    assertEquals(1, dict.getLeftId(wordId));
+    assertEquals(1, dict.getRightId(wordId));
+    assertEquals(2, dict.getWordCost(wordId));
+  }
+
+  private TokenInfoDictionary newDictionary(String... entries) throws Exception {
+    Path dir = createTempDir();
+    try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
+         PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
+      for (String entry : entries) {
+        printer.println(entry);
+      }
+    }
+    TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
+    TokenInfoDictionaryWriter writer = builder.build(dir);
+    writer.write(dir);
+    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
+    // We must also load the other files (in BinaryDictionary) from the correct path
+    return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
+  }
+
+  public void testPutException() {
+    // too few columns
+    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
+    // left id != right id
+    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
+    // left id != right id
+    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
+    // id too large
+    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
+  }
 
   /** enumerates the entire FST/lookup data and just does basic sanity checks */
   public void testEnumerateAll() throws Exception {
@@ -38,17 +98,17 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
     ConnectionCosts matrix = ConnectionCosts.getInstance();
     FST<Long> fst = tid.getFST().getInternalFST();
     IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
-    InputOutput<Long> mapping;
+    IntsRefFSTEnum.InputOutput<Long> mapping;
     IntsRef scratch = new IntsRef();
     while ((mapping = fstEnum.next()) != null) {
       numTerms++;
       IntsRef input = mapping.input;
-      char chars[] = new char[input.length];
+      char[] chars = new char[input.length];
       for (int i = 0; i < chars.length; i++) {
         chars[i] = (char)input.ints[input.offset+i];
       }
       assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
-      
+
       Long output = mapping.output;
       int sourceId = output.intValue();
       // we walk in order, terms, sourceIds, and wordIds should always be increasing
@@ -60,41 +120,41 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
         int wordId = scratch.ints[scratch.offset+i];
         assertTrue(wordId > lastWordId);
         lastWordId = wordId;
-         
+
         String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
         assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
-        
+
         String inflectionForm = tid.getInflectionForm(wordId);
         assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
         if (inflectionForm != null) {
           // check that it's actually an ipadic inflection form
-          assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));          
+          assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
         }
-        
+
         String inflectionType = tid.getInflectionType(wordId);
         assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
         if (inflectionType != null) {
           // check that it's actually an ipadic inflection type
           assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
         }
-        
+
         int leftId = tid.getLeftId(wordId);
         int rightId = tid.getRightId(wordId);
-        
+
         matrix.get(rightId, leftId);
-        
+
         tid.getWordCost(wordId);
-        
+
         String pos = tid.getPartOfSpeech(wordId);
         assertNotNull(pos);
         assertTrue(UnicodeUtil.validUTF16String(pos));
         // check that it's actually an ipadic pos tag
         assertNotNull(ToStringUtil.getPOSTranslation(pos));
-        
+
         String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
         assertNotNull(pronunciation);
         assertTrue(UnicodeUtil.validUTF16String(pronunciation));
-        
+
         String reading = tid.getReading(wordId, chars, 0, chars.length);
         assertNotNull(reading);
         assertTrue(UnicodeUtil.validUTF16String(reading));
@@ -104,4 +164,5 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
       System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
     }
   }
+
 }
diff --git a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
similarity index 100%
rename from lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
rename to lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java
deleted file mode 100644
index e30d555..0000000
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsBuilder.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja.util;
-
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
-
-public class ConnectionCostsBuilder {
-  
-  private ConnectionCostsBuilder() {
-  }
-  
-  public static ConnectionCostsWriter build(String filename) throws IOException {
-    FileInputStream inputStream = new FileInputStream(filename);
-    Charset cs = StandardCharsets.US_ASCII;
-    CharsetDecoder decoder = cs.newDecoder()
-        .onMalformedInput(CodingErrorAction.REPORT)
-        .onUnmappableCharacter(CodingErrorAction.REPORT);
-    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
-    LineNumberReader lineReader = new LineNumberReader(streamReader);
-    
-    String line = lineReader.readLine();
-    String[] dimensions = line.split("\\s+");
-    
-    assert dimensions.length == 2;
-    
-    int forwardSize = Integer.parseInt(dimensions[0]);
-    int backwardSize = Integer.parseInt(dimensions[1]);
-    
-    assert forwardSize > 0 && backwardSize > 0;
-    
-    ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
-    
-    while ((line = lineReader.readLine()) != null) {
-      String[] fields = line.split("\\s+");
-      
-      assert fields.length == 3;
-      
-      int forwardId = Integer.parseInt(fields[0]);
-      int backwardId = Integer.parseInt(fields[1]);
-      int cost = Integer.parseInt(fields[2]);
-      
-      costs.add(forwardId, backwardId, cost);
-    }
-    return costs;
-  }
-}
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
deleted file mode 100644
index ed9868b..0000000
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/DictionaryBuilder.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja.util;
-
-
-import java.io.File;
-import java.io.IOException;
-
-public class DictionaryBuilder {
-  
-  public enum DictionaryFormat { IPADIC, UNIDIC };
-  
-  private DictionaryBuilder() {
-  }
-  
-  public static void build(DictionaryFormat format,
-      String inputDirname,
-      String outputDirname,
-      String encoding,
-      boolean normalizeEntry) throws IOException {
-    System.out.println("building tokeninfo dict...");
-    TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);    
-    TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
-    tokenInfoDictionary.write(outputDirname);
-    tokenInfoDictionary = null;
-    tokenInfoBuilder = null;
-    System.out.println("done");
-    
-    System.out.print("building unknown word dict...");
-    UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
-    UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
-    unkDictionary.write(outputDirname);
-    unkDictionary = null;
-    unkBuilder = null;
-    System.out.println("done");
-    
-    System.out.print("building connection costs...");
-    ConnectionCostsWriter connectionCosts
-      = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
-    connectionCosts.write(outputDirname);
-    System.out.println("done");
-  }
-  
-  public static void main(String[] args) throws IOException {
-    DictionaryFormat format;
-    if (args[0].equalsIgnoreCase("ipadic")) {
-      format = DictionaryFormat.IPADIC;
-    } else if (args[0].equalsIgnoreCase("unidic")) {
-      format = DictionaryFormat.UNIDIC;
-    } else {
-      System.err.println("Illegal format " + args[0] + " using unidic instead");
-      format = DictionaryFormat.IPADIC;
-    }
-    
-    String inputDirname = args[1];
-    String outputDirname = args[2];
-    String inputEncoding = args[3];
-    boolean normalizeEntries = Boolean.parseBoolean(args[4]);
-    
-    System.out.println("dictionary builder");
-    System.out.println("");
-    System.out.println("dictionary format: " + format);
-    System.out.println("input directory: " + inputDirname);
-    System.out.println("output directory: " + outputDirname);
-    System.out.println("input encoding: " + inputEncoding);
-    System.out.println("normalize entries: " + normalizeEntries);
-    System.out.println("");
-    DictionaryBuilder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
-  }
-  
-}
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
deleted file mode 100644
index f4b7e13..0000000
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/UnknownDictionaryBuilder.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja.util;
-
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.LineNumberReader;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
-
-public class UnknownDictionaryBuilder {
-  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
-  
-  private String encoding = "euc-jp";
-  
-  public UnknownDictionaryBuilder(String encoding) {
-    this.encoding = encoding;
-  }
-  
-  public UnknownDictionaryWriter build(String dirname) throws IOException {
-    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
-    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
-    return unkDictionary;
-  }
-  
-  public UnknownDictionaryWriter readDictionaryFile(String filename)
-      throws IOException {
-    return readDictionaryFile(filename, encoding);
-  }
-  
-  public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
-      throws IOException {
-    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
-    
-    FileInputStream inputStream = new FileInputStream(filename);
-    Charset cs = Charset.forName(encoding);
-    CharsetDecoder decoder = cs.newDecoder()
-        .onMalformedInput(CodingErrorAction.REPORT)
-        .onUnmappableCharacter(CodingErrorAction.REPORT);
-    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
-    LineNumberReader lineReader = new LineNumberReader(streamReader);
-    
-    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
-    
-    List<String[]> lines = new ArrayList<>();
-    String line = null;
-    while ((line = lineReader.readLine()) != null) {
-      // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
-      // even though the unknown dictionary returns hardcoded null here.
-      final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
-      lines.add(parsed);
-    }
-    
-    Collections.sort(lines, new Comparator<String[]>() {
-      public int compare(String[] left, String[] right) {
-        int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
-        int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
-        return leftId - rightId;
-      }
-    });
-    
-    for (String[] entry : lines) {
-      dictionary.put(entry);
-    }
-    
-    return dictionary;
-  }
-  
-  public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
-    FileInputStream inputStream = new FileInputStream(filename);
-    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
-    LineNumberReader lineReader = new LineNumberReader(streamReader);
-    
-    String line = null;
-    
-    while ((line = lineReader.readLine()) != null) {
-      line = line.replaceAll("^\\s", "");
-      line = line.replaceAll("\\s*#.*", "");
-      line = line.replaceAll("\\s+", " ");
-      
-      // Skip empty line or comment line
-      if(line.length() == 0) {
-        continue;
-      }
-      
-      if(line.startsWith("0x")) {  // Category mapping
-        String[] values = line.split(" ", 2);  // Split only first space
-        
-        if(!values[0].contains("..")) {
-          int cp = Integer.decode(values[0]).intValue();
-          dictionary.putCharacterCategory(cp, values[1]);
-        } else {
-          String[] codePoints = values[0].split("\\.\\.");
-          int cpFrom = Integer.decode(codePoints[0]).intValue();
-          int cpTo = Integer.decode(codePoints[1]).intValue();
-          
-          for(int i = cpFrom; i <= cpTo; i++){
-            dictionary.putCharacterCategory(i, values[1]);
-          }
-        }
-      } else {  // Invoke definition
-        String[] values = line.split(" "); // Consecutive space is merged above
-        String characterClassName = values[0];
-        int invoke = Integer.parseInt(values[1]);
-        int group = Integer.parseInt(values[2]);
-        int length = Integer.parseInt(values[3]);
-        dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
-      }
-    }
-  }
-}
diff --git a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
deleted file mode 100644
index 0f7609f..0000000
--- a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.analysis.ja.dict;
-
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
-import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
-import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.LuceneTestCase;
-
-import static java.io.File.separatorChar;
-import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
-
-/**
- * Tests of TokenInfoDictionary build tools; run using ant test-tools
- */
-public class TokenInfoDictionaryTest extends LuceneTestCase {
-
-  public void testPut() throws Exception {
-    TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
-                                               // "large" id
-                                               "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
-    IntsRef wordIdRef = new IntsRefBuilder().get();
-
-    dict.lookupWordIds(0, wordIdRef);
-    int wordId = wordIdRef.ints[wordIdRef.offset];
-    assertEquals(5000, dict.getLeftId(wordId));
-    assertEquals(5000, dict.getRightId(wordId));
-    assertEquals(3, dict.getWordCost(wordId));
-
-    dict.lookupWordIds(1, wordIdRef);
-    wordId = wordIdRef.ints[wordIdRef.offset];
-    assertEquals(1, dict.getLeftId(wordId));
-    assertEquals(1, dict.getRightId(wordId));
-    assertEquals(2, dict.getWordCost(wordId));
-  }
-
-  private TokenInfoDictionary newDictionary(String... entries) throws Exception {
-    Path dir = createTempDir();
-    try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
-         PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
-      for (String entry : entries) {
-        printer.println(entry);
-      }
-    }
-    TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
-    TokenInfoDictionaryWriter writer = builder.build(dir.toString());
-    writer.write(dir.toString());
-    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
-    // We must also load the other files (in BinaryDictionary) from the correct path
-    return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
-  }
-
-  public void testPutException() throws Exception {
-    // too few columns
-    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
-    // left id != right id
-    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
-    // left id != right id
-    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
-    // id too large
-    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
-  }
-}