Posted to commits@lucene.apache.org by ab...@apache.org on 2018/04/16 17:21:07 UTC

[27/46] lucene-solr:jira/solr-11833: LUCENE-8231: Add a new analysis module (nori) similar to Kuromoji but to handle Korean

LUCENE-8231: Add a new analysis module (nori) similar to Kuromoji but to handle Korean

This change adds a Korean analyzer in a new analysis module named nori. It is similar
to Kuromoji but uses the mecab-ko-dic dictionary to perform morphological analysis of Korean
text.
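
For context, a minimal usage sketch of the new analyzer (the field name and the
sample sentence are placeholders; assumes lucene-core and the new
lucene-analyzers-nori jar on the classpath):

  import java.io.IOException;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.ko.KoreanAnalyzer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

  public class NoriExample {
    public static void main(String[] args) throws IOException {
      try (Analyzer analyzer = new KoreanAnalyzer();
           TokenStream ts = analyzer.tokenStream("body", "한국은 아름다운 나라입니다")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // one analyzed morpheme per line
        }
        ts.end();
      }
    }
  }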


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e851b89c
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e851b89c
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e851b89c

Branch: refs/heads/jira/solr-11833
Commit: e851b89cbeb1f55edc0f2c1276e2ae812eca2643
Parents: d5b6857
Author: Jim Ferenczi <ji...@apache.org>
Authored: Fri Apr 13 11:26:42 2018 +0200
Committer: Jim Ferenczi <ji...@apache.org>
Committed: Fri Apr 13 11:26:42 2018 +0200

----------------------------------------------------------------------
 dev-tools/idea/.idea/modules.xml                |   1 +
 dev-tools/idea/lucene/analysis/nori/nori.iml    |  22 +
 .../maven/lucene/analysis/nori/pom.xml.template |  75 ++
 .../maven/lucene/analysis/pom.xml.template      |   1 +
 lucene/CHANGES.txt                              |   6 +
 lucene/NOTICE.txt                               |  12 +
 lucene/analysis/README.txt                      |   5 +
 lucene/analysis/build.xml                       |   7 +-
 lucene/analysis/nori/build.xml                  | 135 +++
 lucene/analysis/nori/ivy.xml                    |  33 +
 .../lucene/analysis/ko/DecompoundToken.java     |  69 ++
 .../lucene/analysis/ko/DictionaryToken.java     | 100 ++
 .../lucene/analysis/ko/GraphvizFormatter.java   | 180 ++++
 .../lucene/analysis/ko/KoreanAnalyzer.java      |  78 ++
 .../ko/KoreanPartOfSpeechStopFilter.java        |  85 ++
 .../ko/KoreanPartOfSpeechStopFilterFactory.java |  51 +
 .../analysis/ko/KoreanReadingFormFilter.java    |  51 +
 .../ko/KoreanReadingFormFilterFactory.java      |  42 +
 .../lucene/analysis/ko/KoreanTokenizer.java     | 957 +++++++++++++++++++
 .../analysis/ko/KoreanTokenizerFactory.java     |  89 ++
 .../java/org/apache/lucene/analysis/ko/POS.java | 304 ++++++
 .../org/apache/lucene/analysis/ko/Token.java    | 125 +++
 .../analysis/ko/dict/BinaryDictionary.java      | 239 +++++
 .../analysis/ko/dict/CharacterDefinition.java   | 136 +++
 .../analysis/ko/dict/ConnectionCosts.java       |  96 ++
 .../lucene/analysis/ko/dict/Dictionary.java     |  83 ++
 .../analysis/ko/dict/TokenInfoDictionary.java   |  77 ++
 .../lucene/analysis/ko/dict/TokenInfoFST.java   |  85 ++
 .../analysis/ko/dict/UnknownDictionary.java     |  61 ++
 .../lucene/analysis/ko/dict/UserDictionary.java | 235 +++++
 .../lucene/analysis/ko/dict/package-info.java   |  21 +
 .../apache/lucene/analysis/ko/package-info.java |  21 +
 .../tokenattributes/PartOfSpeechAttribute.java  |  54 ++
 .../PartOfSpeechAttributeImpl.java              |  92 ++
 .../ko/tokenattributes/ReadingAttribute.java    |  38 +
 .../tokenattributes/ReadingAttributeImpl.java   |  55 ++
 .../ko/tokenattributes/package-info.java        |  21 +
 .../apache/lucene/analysis/ko/util/CSVUtil.java |  95 ++
 .../lucene/analysis/ko/util/package-info.java   |  21 +
 lucene/analysis/nori/src/java/overview.html     |  34 +
 ...ache.lucene.analysis.util.TokenFilterFactory |  16 +
 ...apache.lucene.analysis.util.TokenizerFactory |  16 +
 .../analysis/ko/dict/CharacterDefinition.dat    | Bin 0 -> 65564 bytes
 .../lucene/analysis/ko/dict/ConnectionCosts.dat | Bin 0 -> 11178837 bytes
 .../ko/dict/TokenInfoDictionary$buffer.dat      | Bin 0 -> 7245625 bytes
 .../ko/dict/TokenInfoDictionary$fst.dat         | Bin 0 -> 5640925 bytes
 .../ko/dict/TokenInfoDictionary$posDict.dat     | Bin 0 -> 2712 bytes
 .../ko/dict/TokenInfoDictionary$targetMap.dat   | Bin 0 -> 811783 bytes
 .../ko/dict/UnknownDictionary$buffer.dat        | Bin 0 -> 101 bytes
 .../ko/dict/UnknownDictionary$posDict.dat       | Bin 0 -> 1823 bytes
 .../ko/dict/UnknownDictionary$targetMap.dat     | Bin 0 -> 36 bytes
 .../analysis/ko/StringMockResourceLoader.java   |  58 ++
 .../lucene/analysis/ko/TestKoreanAnalyzer.java  | 109 +++
 ...TestKoreanPartOfSpeechStopFilterFactory.java |  59 ++
 .../ko/TestKoreanReadingFormFilter.java         |  75 ++
 .../ko/TestKoreanReadingFormFilterFactory.java  |  51 +
 .../lucene/analysis/ko/TestKoreanTokenizer.java | 355 +++++++
 .../analysis/ko/TestKoreanTokenizerFactory.java | 113 +++
 .../ko/dict/TestTokenInfoDictionary.java        | 113 +++
 .../analysis/ko/dict/UserDictionaryTest.java    |  62 ++
 .../org/apache/lucene/analysis/ko/userdict.txt  |   5 +
 .../ko/util/BinaryDictionaryWriter.java         | 282 ++++++
 .../ko/util/CharacterDefinitionWriter.java      |  94 ++
 .../ko/util/ConnectionCostsBuilder.java         |  67 ++
 .../analysis/ko/util/ConnectionCostsWriter.java |  75 ++
 .../analysis/ko/util/DictionaryBuilder.java     |  67 ++
 .../ko/util/TokenInfoDictionaryBuilder.java     | 150 +++
 .../ko/util/TokenInfoDictionaryWriter.java      |  49 +
 .../ko/util/UnknownDictionaryBuilder.java       | 134 +++
 .../ko/util/UnknownDictionaryWriter.java        |  65 ++
 .../analysis/ko/dict/UnknownDictionaryTest.java |  74 ++
 lucene/ivy-versions.properties                  |   1 +
 lucene/module-build.xml                         |  22 +
 73 files changed, 6003 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/dev-tools/idea/.idea/modules.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml
index 207a675..f77c34a 100644
--- a/dev-tools/idea/.idea/modules.xml
+++ b/dev-tools/idea/.idea/modules.xml
@@ -13,6 +13,7 @@
 
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/common/analysis-common.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/icu/icu.iml" />
+      <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/nori/nori.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/kuromoji/kuromoji.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/morfologik/morfologik.iml" />
       <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/opennlp/opennlp.iml" />

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/dev-tools/idea/lucene/analysis/nori/nori.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/lucene/analysis/nori/nori.iml b/dev-tools/idea/lucene/analysis/nori/nori.iml
new file mode 100644
index 0000000..aa2d18e
--- /dev/null
+++ b/dev-tools/idea/lucene/analysis/nori/nori.iml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="false">
+    <output url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/nori/classes/java" />
+    <output-test url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/nori/classes/test" />
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
+      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/src/tools/java" isTestSource="false" />
+      <sourceFolder url="file://$MODULE_DIR$/src/tools/test" isTestSource="true" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="ICU library" level="project" />
+    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
+    <orderEntry type="module" module-name="analysis-common" />
+    <orderEntry type="module" module-name="lucene-core" />
+  </component>
+</module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/dev-tools/maven/lucene/analysis/nori/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/nori/pom.xml.template b/dev-tools/maven/lucene/analysis/nori/pom.xml.template
new file mode 100644
index 0000000..ac37a08
--- /dev/null
+++ b/dev-tools/maven/lucene/analysis/nori/pom.xml.template
@@ -0,0 +1,75 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-parent</artifactId>
+    <version>@version@</version>
+    <relativePath>../../pom.xml</relativePath>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-analyzers-nori</artifactId>
+  <packaging>jar</packaging>
+  <name>Lucene Nori Korean Morphological Analyzer</name>
+  <description>
+    Lucene Nori Korean Morphological Analyzer
+  </description>
+  <properties>
+    <module-directory>lucene/analysis/nori</module-directory>
+    <relative-top-level>../../../..</relative-top-level>
+    <module-path>${relative-top-level}/${module-directory}</module-path>
+  </properties>
+  <scm>
+    <connection>scm:git:${vc-anonymous-base-url}</connection>
+    <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
+    <url>${vc-browse-base-url};f=${module-directory}</url>
+  </scm>
+  <dependencies>
+    <dependency> 
+      <!-- lucene-test-framework dependency must be declared before lucene-core -->
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-test-framework</artifactId>
+      <scope>test</scope>
+    </dependency>
+@lucene-analyzers-nori.internal.dependencies@
+@lucene-analyzers-nori.external.dependencies@
+@lucene-analyzers-nori.internal.test.dependencies@
+@lucene-analyzers-nori.external.test.dependencies@
+  </dependencies>
+  <build>
+    <sourceDirectory>${module-path}/src/java</sourceDirectory>
+    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
+    <resources>
+      <resource>
+        <directory>${module-path}/src/resources</directory>
+      </resource>
+    </resources>
+    <testResources>
+      <testResource>
+        <directory>${project.build.testSourceDirectory}</directory>
+        <excludes>
+          <exclude>**/*.java</exclude>
+        </excludes>
+      </testResource>
+    </testResources>
+  </build>
+</project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/dev-tools/maven/lucene/analysis/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/pom.xml.template b/dev-tools/maven/lucene/analysis/pom.xml.template
index 466ad30..dada0d5 100644
--- a/dev-tools/maven/lucene/analysis/pom.xml.template
+++ b/dev-tools/maven/lucene/analysis/pom.xml.template
@@ -35,6 +35,7 @@
     <module>icu</module>
     <module>kuromoji</module>
     <module>morfologik</module>
+    <module>nori</module>
     <module>opennlp</module>
     <module>phonetic</module>
     <module>smartcn</module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9016a50..a0e339e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -41,6 +41,12 @@ API Changes
 * LUCENE-8242: Deprecated method IndexSearcher#createNormalizedWeight() has
   been removed (Alan Woodward)
 
+New Features
+
+* LUCENE-8231: A new analysis module (nori), similar to Kuromoji but for
+  Korean, using the mecab-ko-dic dictionary and morphological analysis.
+  (Robert Muir, Jim Ferenczi)
+
 Changes in Runtime Behavior
 
 * LUCENE-7837: Indices that were created before the previous major version

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/NOTICE.txt
----------------------------------------------------------------------
diff --git a/lucene/NOTICE.txt b/lucene/NOTICE.txt
index e25c211..4970d20 100644
--- a/lucene/NOTICE.txt
+++ b/lucene/NOTICE.txt
@@ -190,3 +190,15 @@ grants independently of ICOT any specific warranty to the user in
 writing, such person, organization or entity, will also be exempted
 from and not be held liable to the user for any such damages as noted
 above as far as the program is concerned.
+
+===========================================================================
+Nori Korean Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+  mecab-ko-dic-2.0.3-20170922
+
+which can be obtained from
+
+  https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/README.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/README.txt b/lucene/analysis/README.txt
index c68584e..8a9c8d9 100644
--- a/lucene/analysis/README.txt
+++ b/lucene/analysis/README.txt
@@ -28,6 +28,9 @@ lucene-analyzers-kuromoji-XX.jar
 lucene-analyzers-morfologik-XX.jar
   An analyzer using the Morfologik stemming library.
 
+lucene-analyzers-nori-XX.jar
+  An analyzer with morphological analysis for Korean.
+
 lucene-analyzers-opennlp-XX.jar
   An analyzer using the OpenNLP natural-language processing library.
 
@@ -52,6 +55,7 @@ common/src/java
 icu/src/java
 kuromoji/src/java
 morfologik/src/java
+nori/src/java
 opennlp/src/java
 phonetic/src/java
 smartcn/src/java
@@ -63,6 +67,7 @@ common/src/test
 icu/src/test
 kuromoji/src/test
 morfologik/src/test
+nori/src/test
 opennlp/src/test
 phonetic/src/test
 smartcn/src/test

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/build.xml b/lucene/analysis/build.xml
index ed1566c..95dd7dc 100644
--- a/lucene/analysis/build.xml
+++ b/lucene/analysis/build.xml
@@ -25,6 +25,7 @@
       - icu: Analyzers that use functionality from ICU
       - kuromoji: Japanese Morphological Analyzer
       - morfologik: Morfologik Stemmer
+      - nori: Korean Morphological Analyzer
       - smartcn: Smart Analyzer for Simplified Chinese Text
       - stempel: Algorithmic Stemmer for Polish
       - uima: UIMA Analysis module
@@ -65,6 +66,10 @@
     <ant dir="morfologik" />
   </target>
 
+  <target name="nori">
+    <ant dir="nori" />
+  </target>
+
   <target name="opennlp">
     <ant dir="opennlp" />
   </target>
@@ -86,7 +91,7 @@
   </target>
 
   <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,kuromoji,morfologik,opennlp,phonetic,smartcn,stempel,uima" />
+  <target name="compile" depends="common,icu,kuromoji,morfologik,nori,opennlp,phonetic,smartcn,stempel,uima" />
 
   <target name="clean">
     <forall-analyzers target="clean"/>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/build.xml b/lucene/analysis/nori/build.xml
new file mode 100644
index 0000000..dacf3a9
--- /dev/null
+++ b/lucene/analysis/nori/build.xml
@@ -0,0 +1,135 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="analyzers-nori" default="default" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <description>
+    Korean Morphological Analyzer
+  </description>
+
+  <!-- currently whether rat detects this as binary or not
+       is platform dependent?! -->
+  <property name="rat.excludes" value="**/*.txt,**/bocchan.utf-8"/>
+  <property name="rat.additional-includes" value="src/tools/**"/>
+
+  <!-- we don't want to pull in ipadic/naist etc -->
+  <property name="ivy.default.configuration" value="default"/>
+  <import file="../analysis-module-build.xml"/>
+
+  <!-- default configuration for Korean: uses mecab-ko-dic -->
+  <property name="dict.type" value="mecab-ko-dic"/>
+  <property name="dict.version" value="mecab-ko-dic-2.0.3-20170922" />
+
+  <property name="dict.src.file" value="${dict.version}.tar.gz" />
+  <property name="dict.src.dir" value="${build.dir}/${dict.version}" />
+  <property name="dict.encoding" value="utf-8"/>
+  <property name="dict.normalize" value="false"/>
+  <property name="dict.target.dir" location="${resources.dir}"/>
+
+  <available type="dir" file="${build.dir}/${dict.version}" property="mecab-ko.dict.available"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+  <target name="download-dict" depends="ivy-availability-check,ivy-fail,ivy-configure" unless="mecab-ko.dict.available">
+    <ivy:retrieve pattern="${build.dir}/${dict.src.file}" conf="${dict.type}" symlink="${ivy.symlink}"/>
+    <!-- TODO: we should checksum too -->
+    <gunzip src="${build.dir}/${dict.src.file}"/>
+    <untar src="${build.dir}/${dict.version}.tar" dest="${build.dir}"/>
+  </target>
+
+  <path id="tools.dependencies">
+    <fileset dir="../icu/lib"/>
+  </path>
+
+  <path id="tools.classpath">
+    <path refid="classpath"/>
+    <path refid="tools.dependencies"/>
+    <pathelement location="${build.dir}/classes/java"/>
+    <pathelement location="${build.dir}/classes/tools"/>
+  </path>
+
+  <path id="tools.test.classpath">
+    <path refid="tools.classpath"/>
+    <path refid="test.base.classpath"/>
+    <pathelement location="${build.dir}/classes/tools-test"/>
+  </path>
+
+  <target name="build-dict" depends="compile-tools, download-dict">
+    <sequential>
+      <delete verbose="true">
+        <fileset dir="${resources.dir}/org/apache/lucene/analysis/ko/dict" includes="**/*"/>
+      </delete>
+      <!-- TODO: optimize the dictionary construction a bit so that you don't need 1G -->
+      <java fork="true" failonerror="true" maxmemory="1g" classname="org.apache.lucene.analysis.ko.util.DictionaryBuilder">
+        <classpath>
+          <path refid="tools.classpath"/>
+        </classpath>
+        <assertions>
+          <enable package="org.apache.lucene"/>
+        </assertions>
+        <arg value="${dict.src.dir}"/>
+        <arg value="${dict.target.dir}"/>
+        <arg value="${dict.encoding}"/>
+        <arg value="${dict.normalize}"/>
+      </java>
+    </sequential>
+  </target>
+
+  <!-- we don't actually need to compile this thing, we just want its lib -->
+  <target name="resolve-icu">
+    <ant dir="../icu/" target="resolve" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
+  </target>
+
+  <target name="compile-tools" depends="resolve-icu, compile-core, common.compile-tools">
+    <compile
+        srcdir="src/tools/java"
+        destdir="${build.dir}/classes/tools">
+      <classpath>
+        <path refid="tools.classpath"/>
+      </classpath>
+    </compile>
+  </target>
+
+  <target name="compile-tools-tests" depends="compile-tools">
+    <compile
+        srcdir="src/tools/test"
+        destdir="${build.dir}/classes/tools-test">
+      <classpath>
+        <path refid="tools.test.classpath"/>
+        <pathelement path="src/tools/test"/>
+      </classpath>
+    </compile>
+  </target>
+
+  <target name="test-tools" depends="compile-tools-tests">
+    <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+  </target>
+
+  <target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
+  <!-- TODO: not until we properly make 'test-tools' work with clover etc
+  <target name="test" depends="module-build.test, test-tools"/> -->
+
+  <target name="regenerate" depends="build-dict"/>
+</project>
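
The build-dict target above forks org.apache.lucene.analysis.ko.util.DictionaryBuilder
with four positional arguments. A sketch of the equivalent direct invocation, assuming
the main(String[]) entry point that the <java> task requires; the paths are placeholders
for ${dict.src.dir} and ${dict.target.dir}:

  public class BuildNoriDict {
    public static void main(String[] args) throws Exception {
      org.apache.lucene.analysis.ko.util.DictionaryBuilder.main(new String[] {
          "build/mecab-ko-dic-2.0.3-20170922", // unpacked mecab-ko-dic sources
          "src/resources",                     // where the binary .dat files are written
          "utf-8",                             // dictionary encoding
          "false"                              // do not normalize entries
      });
    }
  }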

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/ivy.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/ivy.xml b/lucene/analysis/nori/ivy.xml
new file mode 100644
index 0000000..8d32937
--- /dev/null
+++ b/lucene/analysis/nori/ivy.xml
@@ -0,0 +1,33 @@
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+-->
+<ivy-module version="2.0">
+  <info organisation="org.apache.lucene" module="analyzers-nori"/>
+
+  <configurations defaultconfmapping="mecab-ko-dic->default"> <!-- 'master' conf not available to map to -->
+    <conf name="default" description="explicitly declare this configuration in order to not download dictionaries unless explicitly called for"/>
+    <conf name="mecab-ko-dic" description="mecab-ko dictionary for Korean" transitive="false"/>
+  </configurations>
+
+  <dependencies>
+    <dependency org="mecab" name="mecab-ko-dic" rev="${/mecab/mecab-ko-dic}" conf="mecab-ko-dic">
+      <artifact name="mecab-ko-dic" type=".tar.gz" url="https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz" />
+    </dependency>
+    <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
+  </dependencies>
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java
new file mode 100644
index 0000000..a44a2d8
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DecompoundToken.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import org.apache.lucene.analysis.ko.dict.Dictionary;
+
+/**
+ * A token that was generated from a compound.
+ */
+public class DecompoundToken extends Token {
+  private final POS.Tag posTag;
+
+  /**
+   *  Creates a new DecompoundToken
+   * @param posTag The part of speech of the token.
+   * @param surfaceForm The surface form of the token.
+   * @param startOffset The start offset of the token in the analyzed text.
+   * @param endOffset The end offset of the token in the analyzed text.
+   */
+  public DecompoundToken(POS.Tag posTag, String surfaceForm, int startOffset, int endOffset) {
+    super(surfaceForm.toCharArray(), 0, surfaceForm.length(), startOffset, endOffset);
+    this.posTag = posTag;
+  }
+
+  @Override
+  public String toString() {
+    return "DecompoundToken(\"" + getSurfaceFormString() + "\" pos=" + getStartOffset() + " length=" + getLength() +
+        " startOffset=" + getStartOffset() + " endOffset=" + getEndOffset() + ")";
+  }
+
+  @Override
+  public POS.Type getPOSType() {
+    return POS.Type.MORPHEME;
+  }
+
+  @Override
+  public POS.Tag getLeftPOS() {
+    return posTag;
+  }
+
+  @Override
+  public POS.Tag getRightPOS() {
+    return posTag;
+  }
+
+  @Override
+  public String getReading() {
+    return null;
+  }
+
+  @Override
+  public Dictionary.Morpheme[] getMorphemes() {
+    return null;
+  }
+}
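
A small fragment illustrating the contract above (assumed to live in the
org.apache.lucene.analysis.ko package; POS.Tag.NNG, a general-noun tag from
mecab-ko-dic, and the surface form are illustrative):

  static void decompoundTokenDemo() {
    DecompoundToken token = new DecompoundToken(POS.Tag.NNG, "학교", 0, 2);
    assert token.getPOSType() == POS.Type.MORPHEME;   // always a simple morpheme
    assert token.getLeftPOS() == token.getRightPOS(); // one tag covers both sides
    assert token.getReading() == null;                // no reading for decompounded parts
    assert token.getMorphemes() == null;              // and no further decomposition
  }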

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java
new file mode 100644
index 0000000..3efb119
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/DictionaryToken.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import org.apache.lucene.analysis.ko.dict.Dictionary;
+
+/**
+ * A token stored in a {@link Dictionary}.
+ */
+public class DictionaryToken extends Token {
+  private final int wordId;
+  private final KoreanTokenizer.Type type;
+  private final Dictionary dictionary;
+
+  public DictionaryToken(KoreanTokenizer.Type type, Dictionary dictionary, int wordId, char[] surfaceForm,
+                         int offset, int length, int startOffset, int endOffset) {
+    super(surfaceForm, offset, length, startOffset, endOffset);
+    this.type = type;
+    this.dictionary = dictionary;
+    this.wordId = wordId;
+  }
+
+  @Override
+  public String toString() {
+    return "DictionaryToken(\"" + getSurfaceFormString() + "\" pos=" + getStartOffset() + " length=" + getLength() +
+        " posLen=" + getPositionLength() + " type=" + type + " wordId=" + wordId +
+        " leftID=" + dictionary.getLeftId(wordId) + ")";
+  }
+
+  /**
+   * Returns the type of this token
+   * @return token type, not null
+   */
+  public KoreanTokenizer.Type getType() {
+    return type;
+  }
+
+  /**
+   * Returns true if this token is a known word
+   * @return true if this token is in the standard dictionary, false otherwise
+   */
+  public boolean isKnown() {
+    return type == KoreanTokenizer.Type.KNOWN;
+  }
+
+  /**
+   * Returns true if this token is an unknown word
+   * @return true if this token is an unknown word, false otherwise
+   */
+  public boolean isUnknown() {
+    return type == KoreanTokenizer.Type.UNKNOWN;
+  }
+
+  /**
+   * Returns true if this token is defined in the user dictionary
+   * @return true if this token is in the user dictionary, false otherwise
+   */
+  public boolean isUser() {
+    return type == KoreanTokenizer.Type.USER;
+  }
+
+  @Override
+  public POS.Type getPOSType() {
+    return dictionary.getPOSType(wordId);
+  }
+
+  @Override
+  public POS.Tag getLeftPOS() {
+    return dictionary.getLeftPOS(wordId);
+  }
+
+  @Override
+  public POS.Tag getRightPOS() {
+    return dictionary.getRightPOS(wordId);
+  }
+
+  @Override
+  public String getReading() {
+    return dictionary.getReading(wordId);
+  }
+
+  @Override
+  public Dictionary.Morpheme[] getMorphemes() {
+    return dictionary.getMorphemes(wordId, getSurfaceForm(), getOffset(), getLength());
+  }
+}
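
The isKnown/isUnknown/isUser helpers map one-to-one onto KoreanTokenizer.Type; a
fragment (same package assumed) showing how a consumer might branch on a token's
source:

  static String describeSource(DictionaryToken token) {
    if (token.isUser()) {
      return "user dictionary entry";            // KoreanTokenizer.Type.USER
    } else if (token.isKnown()) {
      return "system dictionary (mecab-ko-dic)"; // KoreanTokenizer.Type.KNOWN
    } else if (token.isUnknown()) {
      return "heuristically segmented word";     // KoreanTokenizer.Type.UNKNOWN
    }
    return "unexpected type: " + token.getType();
  }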

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
new file mode 100644
index 0000000..9feb354
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.ko.KoreanTokenizer.Position;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.WrappedPositionArray;
+import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
+import org.apache.lucene.analysis.ko.dict.Dictionary;
+
+
+// TODO: would be nice to show 2nd best path in a diff't
+// color...
+
+/**
+ * Outputs the dot (graphviz) string for the viterbi lattice.
+ */
+public class GraphvizFormatter {
+  
+  private final static String BOS_LABEL = "BOS";
+  
+  private final static String EOS_LABEL = "EOS";
+  
+  private final static String FONT_NAME = "Helvetica";
+  
+  private final ConnectionCosts costs;
+  
+  private final Map<String, String> bestPathMap;
+  
+  private final StringBuilder sb = new StringBuilder();
+  
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.bestPathMap = new HashMap<>();
+    sb.append(formatHeader());
+    sb.append("  init [style=invis]\n");
+    sb.append("  init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
+  }
+
+  public String finish() {
+    sb.append(formatTrailer());
+    return sb.toString();
+  }
+
+  // Backtraces another incremental fragment:
+  void onBacktrace(KoreanTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
+    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
+    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+    if (isEnd) {
+      sb.append("  fini [style=invis]\n");
+      sb.append("  ");
+      sb.append(getNodeID(endPosData.pos, fromIDX));
+      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
+    }
+  }
+
+  // Records which arcs make up the best path:
+  private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
+    bestPathMap.clear();
+
+    int pos = endPosData.pos;
+    int bestIDX = fromIDX;
+    while (pos > startPos) {
+      final Position posData = positions.get(pos);
+
+      final int backPos = posData.backPos[bestIDX];
+      final int backIDX = posData.backIndex[bestIDX];
+
+      final String toNodeID = getNodeID(pos, bestIDX);
+      final String fromNodeID = getNodeID(backPos, backIDX);
+      
+      assert !bestPathMap.containsKey(fromNodeID);
+      assert !bestPathMap.containsValue(toNodeID);
+      bestPathMap.put(fromNodeID, toNodeID);
+      pos = backPos;
+      bestIDX = backIDX;
+    }
+  }
+  
+  private String formatNodes(KoreanTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {
+
+    StringBuilder sb = new StringBuilder();
+    // Output nodes
+    for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        sb.append("  ");
+        sb.append(getNodeID(pos, idx));
+        sb.append(" [label=\"");
+        sb.append(pos);
+        sb.append(": ");
+        sb.append(posData.lastRightID[idx]);
+        sb.append("\"]\n");
+      }
+    }
+
+    // Output arcs
+    for (int pos = endPosData.pos; pos > startPos; pos--) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        final Position backPosData = positions.get(posData.backPos[idx]);
+        final String toNodeID = getNodeID(pos, idx);
+        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
+
+        sb.append("  ");
+        sb.append(fromNodeID);
+        sb.append(" -> ");
+        sb.append(toNodeID);
+
+        final String attrs;
+        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
+          // This arc is on best path
+          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
+        } else {
+          attrs = "";
+        }
+
+        final Dictionary dict = tok.getDict(posData.backType[idx]);
+        final int wordCost = dict.getWordCost(posData.backID[idx]);
+        final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
+                                     dict.getLeftId(posData.backID[idx]));
+
+        final String surfaceForm = new String(fragment,
+                                              posData.backPos[idx] - startPos,
+                                              pos - posData.backPos[idx]);
+        
+        sb.append(" [label=\"");
+        sb.append(surfaceForm);
+        sb.append(' ');
+        sb.append(wordCost);
+        if (bgCost >= 0) {
+          sb.append('+');
+        }
+        sb.append(bgCost);
+        sb.append("\"");
+        sb.append(attrs);
+        sb.append("]\n");
+      }
+    }
+    return sb.toString();
+  }
+  
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+    //sb.append("  // A2 paper size\n");
+    //sb.append("  size = \"34.4,16.5\";\n");
+    //sb.append("  // try to fill paper\n");
+    //sb.append("  ratio = fill;\n");
+    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
+  }
+  
+  private String formatTrailer() {
+    return "}";
+  }
+  
+  private String getNodeID(int pos, int idx) {
+    return pos + "." + idx;
+  }
+}
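
To use the formatter, install it on the tokenizer via setGraphvizFormatter (defined
in KoreanTokenizer below), consume the whole stream so every fragment is backtraced,
then call finish() for the dot source. A sketch; the sample sentence is a placeholder,
and ConnectionCosts.getInstance() is the same singleton the tokenizer itself uses:

  import java.io.StringReader;
  import org.apache.lucene.analysis.ko.GraphvizFormatter;
  import org.apache.lucene.analysis.ko.KoreanTokenizer;
  import org.apache.lucene.analysis.ko.dict.ConnectionCosts;

  public class LatticeDump {
    public static void main(String[] args) throws Exception {
      GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
      try (KoreanTokenizer tokenizer = new KoreanTokenizer()) {
        tokenizer.setGraphvizFormatter(gv);
        tokenizer.setReader(new StringReader("한국은 아름다운 나라입니다"));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
          // consume tokens; the formatter records nodes and arcs on each backtrace
        }
        tokenizer.end();
      }
      System.out.println(gv.finish()); // dot source, renderable with graphviz
    }
  }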

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
new file mode 100644
index 0000000..76023bb
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer.DecompoundMode;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+
+import static org.apache.lucene.analysis.TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
+
+/**
+ * Analyzer for Korean that uses morphological analysis.
+ * @see KoreanTokenizer
+ * @lucene.experimental
+ */
+public class KoreanAnalyzer extends Analyzer {
+  private final UserDictionary userDict;
+  private final KoreanTokenizer.DecompoundMode mode;
+  private final Set<POS.Tag> stopTags;
+  private final boolean outputUnknownUnigrams;
+
+  /**
+   * Creates a new KoreanAnalyzer.
+   */
+  public KoreanAnalyzer() {
+    this(null, KoreanTokenizer.DEFAULT_DECOMPOUND, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
+  }
+
+  /**
+   * Creates a new KoreanAnalyzer.
+   *
+   * @param userDict Optional: if non-null, user dictionary.
+   * @param mode Decompound mode.
+   * @param stopTags The set of part-of-speech tags that should be filtered.
+   * @param outputUnknownUnigrams If true, outputs unigrams for unknown words.
+   */
+  public KoreanAnalyzer(UserDictionary userDict, DecompoundMode mode, Set<POS.Tag> stopTags, boolean outputUnknownUnigrams) {
+    super();
+    this.userDict = userDict;
+    this.mode = mode;
+    this.stopTags = stopTags;
+    this.outputUnknownUnigrams = outputUnknownUnigrams;
+  }
+  
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer tokenizer = new KoreanTokenizer(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict, mode, outputUnknownUnigrams);
+    TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
+    stream = new KoreanReadingFormFilter(stream);
+    stream = new LowerCaseFilter(stream);
+    return new TokenStreamComponents(tokenizer, stream);
+  }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new LowerCaseFilter(in);
+    return result;
+  }
+}
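
createComponents fixes the default chain; the same pipeline can also be assembled by
hand when one of the stages is not wanted. A fragment mirroring the code above
(imports as in this file):

  Tokenizer tokenizer = new KoreanTokenizer(); // defaults: no user dict, DISCARD mode
  TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer,
      KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS);
  stream = new KoreanReadingFormFilter(stream);
  stream = new LowerCaseFilter(stream);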

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
new file mode 100644
index 0000000..4fa7524
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
+
+/**
+ * Removes tokens that match a set of part-of-speech tags.
+ * @lucene.experimental
+ */
+public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
+  private final Set<POS.Tag> stopTags;
+  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+
+  /**
+   * Default list of tags to filter.
+   */
+  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = Arrays.asList(
+      POS.Tag.E,
+      POS.Tag.IC,
+      POS.Tag.J,
+      POS.Tag.MAG,
+      POS.Tag.MAJ,
+      POS.Tag.MM,
+      POS.Tag.SP,
+      POS.Tag.SSC,
+      POS.Tag.SSO,
+      POS.Tag.SC,
+      POS.Tag.SE,
+      POS.Tag.XPN,
+      POS.Tag.XSA,
+      POS.Tag.XSN,
+      POS.Tag.XSV,
+      POS.Tag.UNA,
+      POS.Tag.NA,
+      POS.Tag.VSV
+  ).stream().collect(Collectors.toSet());
+
+  /**
+   * Create a new {@link KoreanPartOfSpeechStopFilter} with the default
+   * list of stop tags {@link #DEFAULT_STOP_TAGS}.
+   *
+   * @param input    the {@link TokenStream} to consume
+   */
+  public KoreanPartOfSpeechStopFilter(TokenStream input) {
+    this(input, DEFAULT_STOP_TAGS);
+  }
+
+  /**
+   * Create a new {@link KoreanPartOfSpeechStopFilter}.
+   * @param input    the {@link TokenStream} to consume
+   * @param stopTags the part-of-speech tags that should be removed
+   */
+  public KoreanPartOfSpeechStopFilter(TokenStream input, Set<POS.Tag> stopTags) {
+    super(input);
+    this.stopTags = stopTags;
+  }
+
+  @Override
+  protected boolean accept() {
+    final POS.Tag leftPOS = posAtt.getLeftPOS();
+    return leftPOS == null || !stopTags.contains(leftPOS);
+  }
+}
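
Because the constructor accepts any Set<POS.Tag>, callers can filter more or fewer
tags than DEFAULT_STOP_TAGS; a fragment using a deliberately tiny, illustrative tag
set (assumes java.util.EnumSet plus the imports of this file):

  // remove only verbal endings (E) and particles (J), both in the default set above
  Set<POS.Tag> stopTags = EnumSet.of(POS.Tag.E, POS.Tag.J);
  TokenStream filtered = new KoreanPartOfSpeechStopFilter(new KoreanTokenizer(), stopTags);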

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java
new file mode 100644
index 0000000..010abc8
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilterFactory.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link KoreanPartOfSpeechStopFilter}.
+ * @lucene.experimental
+ */
+public class KoreanPartOfSpeechStopFilterFactory extends TokenFilterFactory {
+  private Set<POS.Tag> stopTags;
+
+  /** Creates a new KoreanPartOfSpeechStopFilterFactory */
+  public KoreanPartOfSpeechStopFilterFactory(Map<String,String> args) {
+    super(args);
+    Set<String> stopTagStr = getSet(args, "tags");
+    if (stopTagStr == null) {
+      stopTags = KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+    } else {
+      stopTags = stopTagStr.stream().map(POS::resolveTag).collect(Collectors.toSet());
+    }
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+  
+  @Override
+  public TokenStream create(TokenStream stream) {
+    return new KoreanPartOfSpeechStopFilter(stream, stopTags);
+  }
+}
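
The factory reads an optional "tags" argument (assumed comma-separated, following
getSet) and resolves each name through POS.resolveTag. A configuration sketch in Java;
the luceneMatchVersion entry and the tag list are illustrative (imports:
java.util.HashMap, org.apache.lucene.util.Version):

  Map<String, String> factoryArgs = new HashMap<>();
  factoryArgs.put("luceneMatchVersion", Version.LATEST.toString());
  factoryArgs.put("tags", "E,J"); // omit to fall back to DEFAULT_STOP_TAGS
  KoreanPartOfSpeechStopFilterFactory factory =
      new KoreanPartOfSpeechStopFilterFactory(factoryArgs);
  TokenStream filtered = factory.create(new KoreanTokenizer());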

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilter.java
new file mode 100644
index 0000000..8b7e6cb
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilter.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Replaces term text with the {@link ReadingAttribute} which is
+ * the Hangul transcription of Hanja characters.
+ * @lucene.experimental
+ */
+public final class KoreanReadingFormFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+
+  public KoreanReadingFormFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      String reading = readingAtt.getReading();
+      if (reading != null) {
+        termAtt.setEmpty().append(reading);
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
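
Only tokens carrying a non-null ReadingAttribute (Hanja with a Hangul transcription)
are rewritten; everything else passes through unchanged. Wiring it directly over the
tokenizer is a one-liner:

  TokenStream ts = new KoreanReadingFormFilter(new KoreanTokenizer());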

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java
new file mode 100644
index 0000000..860a139
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanReadingFormFilterFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link KoreanReadingFormFilter}.
+ * @lucene.experimental
+ */
+public class KoreanReadingFormFilterFactory extends TokenFilterFactory {
+
+  /** Creates a new KoreanReadingFormFilterFactory */
+  public KoreanReadingFormFilterFactory(Map<String,String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+  
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new KoreanReadingFormFilter(input);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e851b89c/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
new file mode 100644
index 0000000..822853b
--- /dev/null
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -0,0 +1,957 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.ko;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.EnumMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
+import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
+import org.apache.lucene.analysis.ko.dict.Dictionary;
+import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
+import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
+import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.RollingCharBuffer;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * Tokenizer for Korean that uses morphological analysis.
+ * <p>
+ * This tokenizer sets a number of additional attributes:
+ * <ul>
+ *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
+ *   <li>{@link ReadingAttribute} containing reading.
+ * </ul>
+ * <p>
+ * This tokenizer uses a rolling Viterbi search to find the
+ * least cost segmentation (path) of the incoming characters.
+ * @lucene.experimental
+ */
+public final class KoreanTokenizer extends Tokenizer {
+
+  /**
+   * Token type reflecting the original source of this token
+   */
+  public enum Type {
+    /**
+     * Known words from the system dictionary.
+     */
+    KNOWN,
+    /**
+     * Unknown words (heuristically segmented).
+     */
+    UNKNOWN,
+    /**
+     * Known words from the user dictionary.
+     */
+    USER
+  }
+
+  /**
+   * Decompound mode: this determines how the tokenizer handles
+   * {@link POS.Type#COMPOUND}, {@link POS.Type#INFLECT} and {@link POS.Type#PREANALYSIS} tokens.
+   */
+  public enum DecompoundMode {
+    /**
+     * No decomposition for compounds.
+     */
+    NONE,
+
+    /**
+     * Decompose compounds and discard the original form (default).
+     */
+    DISCARD,
+
+    /**
+     * Decompose compounds and keep the original form.
+     */
+    MIXED
+  }
+
+  /**
+   * Default mode for decompounding tokens ({@link DecompoundMode#DISCARD}).
+   */
+  public static final DecompoundMode DEFAULT_DECOMPOUND = DecompoundMode.DISCARD;
+
+  private static final boolean VERBOSE = false;
+
+  // For safety:
+  private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
+  private static final int MAX_BACKTRACE_GAP = 1024;
+
+  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);
+
+  private final TokenInfoFST fst;
+  private final TokenInfoDictionary dictionary;
+  private final UnknownDictionary unkDictionary;
+  private final ConnectionCosts costs;
+  private final UserDictionary userDictionary;
+  private final CharacterDefinition characterDefinition;
+
+  private final FST.Arc<Long> arc = new FST.Arc<>();
+  private final FST.BytesReader fstReader;
+  private final IntsRef wordIdRef = new IntsRef();
+
+  private final FST.BytesReader userFSTReader;
+  private final TokenInfoFST userFST;
+
+  private final DecompoundMode mode;
+  private final boolean outputUnknownUnigrams;
+
+  private final RollingCharBuffer buffer = new RollingCharBuffer();
+
+  private final WrappedPositionArray positions = new WrappedPositionArray();
+
+  // True once we've hit the EOF from the input reader:
+  private boolean end;
+
+  // Last absolute position we backtraced from:
+  private int lastBackTracePos;
+
+  // Next absolute position to process:
+  private int pos;
+
+  // Already parsed, but not yet passed to caller, tokens:
+  private final List<Token> pending = new ArrayList<>();
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
+  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
+
+
+  /**
+   * Creates a new KoreanTokenizer with default parameters.
+   * <p>
+   * Uses the default AttributeFactory.
+   */
+  public KoreanTokenizer() {
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false);
+  }
+
+  /**
+   * Create a new KoreanTokenizer.
+   *
+   * @param factory the AttributeFactory to use
+   * @param userDictionary Optional user dictionary (may be {@code null}).
+   * @param mode Decompound mode.
+   * @param outputUnknownUnigrams If true, outputs unigrams for unknown words.
+   */
+  public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams) {
+    super(factory);
+    this.mode = mode;
+    this.outputUnknownUnigrams = outputUnknownUnigrams;
+    dictionary = TokenInfoDictionary.getInstance();
+    fst = dictionary.getFST();
+    unkDictionary = UnknownDictionary.getInstance();
+    characterDefinition = unkDictionary.getCharacterDefinition();
+    this.userDictionary = userDictionary;
+    costs = ConnectionCosts.getInstance();
+    fstReader = fst.getBytesReader();
+    if (userDictionary != null) {
+      userFST = userDictionary.getFST();
+      userFSTReader = userFST.getBytesReader();
+    } else {
+      userFST = null;
+      userFSTReader = null;
+    }
+
+    buffer.reset(this.input);
+
+    resetState();
+
+    dictionaryMap.put(Type.KNOWN, dictionary);
+    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
+    dictionaryMap.put(Type.USER, userDictionary);
+  }
+
+  private GraphvizFormatter dotOut;
+
+  /** Expert: set this to produce graphviz (dot) output of
+   *  the Viterbi lattice */
+  public void setGraphvizFormatter(GraphvizFormatter dotOut) {
+    this.dotOut = dotOut;
+  }
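+
+  // An illustrative sketch (assuming the nori GraphvizFormatter mirrors the
+  // Kuromoji one: a ConnectionCosts constructor and a finish() method):
+  //
+  //   GraphvizFormatter dotOut = new GraphvizFormatter(ConnectionCosts.getInstance());
+  //   tokenizer.setGraphvizFormatter(dotOut);
+  //   ... consume the token stream ...
+  //   String dot = dotOut.finish(); // render with graphviz, e.g. dot -Tpng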
+
+  @Override
+  public void close() throws IOException {
+    super.close();
+    buffer.reset(input);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    buffer.reset(input);
+    resetState();
+  }
+
+  private void resetState() {
+    positions.reset();
+    pos = 0;
+    end = false;
+    lastBackTracePos = 0;
+    pending.clear();
+
+    // Add BOS:
+    positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // Set final offset
+    int finalOffset = correctOffset(pos);
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  // Holds all back pointers arriving at this position:
+  final static class Position {
+
+    int pos;
+
+    int count;
+
+    // maybe single int array * 6?
+    int[] costs = new int[8];
+    int[] lastRightID = new int[8];
+    int[] backPos = new int[8];
+    int[] backWordPos = new int[8];
+    int[] backIndex = new int[8];
+    int[] backID = new int[8];
+    Type[] backType = new Type[8];
+
+    public void grow() {
+      costs = ArrayUtil.grow(costs, 1+count);
+      lastRightID = ArrayUtil.grow(lastRightID, 1+count);
+      backPos = ArrayUtil.grow(backPos, 1+count);
+      backWordPos = ArrayUtil.grow(backWordPos, 1+count);
+      backIndex = ArrayUtil.grow(backIndex, 1+count);
+      backID = ArrayUtil.grow(backID, 1+count);
+
+      // NOTE: sneaky: grow separately because
+      // ArrayUtil.grow will otherwise pick a different
+      // length than the int[]s we just grew:
+      final Type[] newBackType = new Type[backID.length];
+      System.arraycopy(backType, 0, newBackType, 0, backType.length);
+      backType = newBackType;
+    }
+
+    public void add(int cost, int lastRightID, int backPos, int backRPos, int backIndex, int backID, Type backType) {
+      // NOTE: this isn't quite a true Viterbi search,
+      // because we should check if lastRightID is
+      // already present here, and only update if the new
+      // cost is less than the current cost, instead of
+      // simply appending.  However, that will likely hurt
+      // performance (usually we add a lastRightID only once),
+      // and it means we actually create the full graph
+      // intersection instead of a "normal" Viterbi lattice:
+      if (count == costs.length) {
+        grow();
+      }
+      this.costs[count] = cost;
+      this.lastRightID[count] = lastRightID;
+      this.backPos[count] = backPos;
+      this.backWordPos[count] = backRPos;
+      this.backIndex[count] = backIndex;
+      this.backID[count] = backID;
+      this.backType[count] = backType;
+      count++;
+    }
+
+    public void reset() {
+      count = 0;
+    }
+  }
+
+  /**
+   * Returns the space penalty associated with the provided {@link POS.Tag}.
+   *
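+   * A space before a morpheme that normally attaches directly to the
+   * previous token is unlikely in well-formed text, so such paths are
+   * penalized. For instance (per the switch below),
+   * {@code computeSpacePenalty(POS.Tag.J, 1)} returns 3000, while a noun
+   * tag with the same spacing incurs no penalty.
+   *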
+   * @param leftPOS the left part of speech of the current token.
+   * @param numSpaces the number of spaces before the current token.
+   */
+  private int computeSpacePenalty(POS.Tag leftPOS, int numSpaces) {
+    int spacePenalty = 0;
+    if (numSpaces > 0) {
+      // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
+      switch (leftPOS) {
+        case E:
+        case J:
+        case VCP:
+        case XSA:
+        case XSN:
+        case XSV:
+          spacePenalty = 3000;
+          break;
+
+        default:
+          break;
+      }
+    }
+    return spacePenalty;
+  }
+
+  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) throws IOException {
+    final POS.Tag leftPOS = dict.getLeftPOS(wordID);
+    final int wordCost = dict.getWordCost(wordID);
+    final int leftID = dict.getLeftId(wordID);
+    int leastCost = Integer.MAX_VALUE;
+    int leastIDX = -1;
+    assert fromPosData.count > 0;
+    for(int idx=0;idx<fromPosData.count;idx++) {
+      // The number of spaces before the term
+      int numSpaces = wordPos - fromPosData.pos;
+
+      // Cost is the path cost so far, plus the word cost (added after the
+      // loop), plus the bigram cost and the space penalty.
+      final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID) + computeSpacePenalty(leftPOS, numSpaces);
+      if (VERBOSE) {
+        System.out.println("      fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.get(fromPosData.lastRightID[idx], leftID) +
+            " spacePenalty=" + computeSpacePenalty(leftPOS, numSpaces) + ") leftID=" + leftID + " leftPOS=" + leftPOS.name() + ")");
+      }
+      if (cost < leastCost) {
+        leastCost = cost;
+        leastIDX = idx;
+        if (VERBOSE) {
+          System.out.println("        **");
+        }
+      }
+    }
+
+    leastCost += wordCost;
+
+    if (VERBOSE) {
+      System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
+    }
+
+    positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, wordPos, leastIDX, wordID, type);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    // parse() is able to return w/o producing any new
+    // tokens, when the tokens it had produced were entirely
+    // punctuation.  So we loop here until we get a real
+    // token or we end:
+    while (pending.size() == 0) {
+      if (end) {
+        return false;
+      }
+
+      // Push Viterbi forward some more:
+      parse();
+    }
+
+    final Token token = pending.remove(pending.size()-1);
+
+    int length = token.getLength();
+    clearAttributes();
+    assert length > 0;
+    //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
+    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
+    offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
+    posAtt.setToken(token);
+    readingAtt.setToken(token);
+    posIncAtt.setPositionIncrement(token.getPositionIncrement());
+    posLengthAtt.setPositionLength(token.getPositionLength());
+    if (VERBOSE) {
+      System.out.println(Thread.currentThread().getName() + ":    incToken: return token=" + token);
+    }
+    return true;
+  }
+
+  // TODO: make generic'd version of this "circular array"?
+  // It's a bit tricky because we do things to the Position
+  // (eg, set .pos = N on reuse)...
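+  //
+  // Usage sketch (illustrative): get(pos) may ask for any absolute position
+  // at or beyond the oldest live one, lazily creating and wrapping slots up
+  // to that position; once a backtrace has committed everything before
+  // absolute position N, freeBefore(N) recycles those slots for reuse.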
+  static final class WrappedPositionArray {
+    private Position[] positions = new Position[8];
+
+    public WrappedPositionArray() {
+      for(int i=0;i<positions.length;i++) {
+        positions[i] = new Position();
+      }
+    }
+
+    // Next array index to write to in positions:
+    private int nextWrite;
+
+    // Next position to write:
+    private int nextPos;
+
+    // How many valid Position instances are held in the
+    // positions array:
+    private int count;
+
+    public void reset() {
+      nextWrite--;
+      while(count > 0) {
+        if (nextWrite == -1) {
+          nextWrite = positions.length - 1;
+        }
+        positions[nextWrite--].reset();
+        count--;
+      }
+      nextWrite = 0;
+      nextPos = 0;
+      count = 0;
+    }
+
+    /** Get Position instance for this absolute position;
+     *  this is allowed to be arbitrarily far "in the
+     *  future" but cannot be before the last freeBefore. */
+    public Position get(int pos) {
+      while(pos >= nextPos) {
+        //System.out.println("count=" + count + " vs len=" + positions.length);
+        if (count == positions.length) {
+          Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          //System.out.println("grow positions " + newPositions.length);
+          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
+          System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
+          for(int i=positions.length;i<newPositions.length;i++) {
+            newPositions[i] = new Position();
+          }
+          nextWrite = positions.length;
+          positions = newPositions;
+        }
+        if (nextWrite == positions.length) {
+          nextWrite = 0;
+        }
+        // Should have already been reset:
+        assert positions[nextWrite].count == 0;
+        positions[nextWrite++].pos = nextPos++;
+        count++;
+      }
+      assert inBounds(pos);
+      final int index = getIndex(pos);
+      assert positions[index].pos == pos;
+      return positions[index];
+    }
+
+    public int getNextPos() {
+      return nextPos;
+    }
+
+    // For assert:
+    private boolean inBounds(int pos) {
+      return pos < nextPos && pos >= nextPos - count;
+    }
+
+    private int getIndex(int pos) {
+      int index = nextWrite - (nextPos - pos);
+      if (index < 0) {
+        index += positions.length;
+      }
+      return index;
+    }
+
+    public void freeBefore(int pos) {
+      final int toFree = count - (nextPos - pos);
+      assert toFree >= 0;
+      assert toFree <= count;
+      int index = nextWrite - count;
+      if (index < 0) {
+        index += positions.length;
+      }
+      for(int i=0;i<toFree;i++) {
+        if (index == positions.length) {
+          index = 0;
+        }
+        //System.out.println("  fb idx=" + index);
+        positions[index].reset();
+        index++;
+      }
+      count -= toFree;
+    }
+  }
+
+  /* Incrementally parse some more characters.  This runs
+   * the Viterbi search forwards "enough" so that we
+   * generate some more tokens.  How much forward depends on
+   * the chars coming in, since some chars could cause
+   * longer-lasting ambiguity in the parsing.  Once the
+   * ambiguity is resolved, then we back trace, produce
+   * the pending tokens, and return. */
+  private void parse() throws IOException {
+    if (VERBOSE) {
+      System.out.println("\nPARSE");
+    }
+
+    // Index of the last character of unknown word:
+    int unknownWordEndIndex = -1;
+
+    // Advances over each position (character):
+    while (true) {
+
+      if (buffer.get(pos) == -1) {
+        // End
+        break;
+      }
+
+      final Position posData = positions.get(pos);
+      final boolean isFrontier = positions.getNextPos() == pos+1;
+
+      if (posData.count == 0) {
+        // No arcs arrive here; move to next position:
+        if (VERBOSE) {
+          System.out.println("    no arcs in; skip pos=" + pos);
+        }
+        pos++;
+        continue;
+      }
+
+      if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
+        // We are at a "frontier", and only one node is
+        // alive, so whatever the eventual best path is must
+        // come through this node.  So we can safely commit
+        // to the prefix of the best path at this point:
+        backtrace(posData, 0);
+
+        // Re-base cost so we don't risk int overflow:
+        posData.costs[0] = 0;
+        if (pending.size() > 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
+        // Safety: if we've buffered too much, force a
+        // backtrace now.  We find the least-cost partial
+        // path, across all paths, backtrace from it, and
+        // then prune all others.  Note that this, in
+        // general, can produce the wrong result, if the
+        // total best path did not in fact back trace
+        // through this partial best path.  But it's the
+        // best we can do... (short of not having a
+        // safety!).
+
+        // First pass: find least cost partial path so far,
+        // including ending at future positions:
+        int leastIDX = -1;
+        int leastCost = Integer.MAX_VALUE;
+        Position leastPosData = null;
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+          final Position posData2 = positions.get(pos2);
+          for(int idx=0;idx<posData2.count;idx++) {
+            //System.out.println("    idx=" + idx + " cost=" + cost);
+            final int cost = posData2.costs[idx];
+            if (cost < leastCost) {
+              leastCost = cost;
+              leastIDX = idx;
+              leastPosData = posData2;
+            }
+          }
+        }
+
+        // We will always have at least one live path:
+        assert leastIDX != -1;
+
+        // Second pass: prune all but the best path:
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+          final Position posData2 = positions.get(pos2);
+          if (posData2 != leastPosData) {
+            posData2.reset();
+          } else {
+            if (leastIDX != 0) {
+              posData2.costs[0] = posData2.costs[leastIDX];
+              posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
+              posData2.backPos[0] = posData2.backPos[leastIDX];
+              posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
+              posData2.backIndex[0] = posData2.backIndex[leastIDX];
+              posData2.backID[0] = posData2.backID[leastIDX];
+              posData2.backType[0] = posData2.backType[leastIDX];
+            }
+            posData2.count = 1;
+          }
+        }
+
+        backtrace(leastPosData, 0);
+
+        // Re-base cost so we don't risk int overflow:
+        Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
+
+        if (pos != leastPosData.pos) {
+          // We jumped into a future position:
+          assert pos < leastPosData.pos;
+          pos = leastPosData.pos;
+        }
+        if (pending.size() > 0) {
+          return;
+        } else {
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("\n  extend @ pos=" + pos + " char=" + (char) buffer.get(pos) + " hex=" + Integer.toHexString(buffer.get(pos)));
+        System.out.println("    " + posData.count + " arcs in");
+      }
+
+      // Move to the first character that is not whitespace.
+      // The whitespace characters are added as a prefix of the term that we
+      // extract; this information is then used when computing the cost of
+      // the term, via the space penalty factor.
+      // They are removed when the final tokens are generated.
+      if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
+        int nextChar = buffer.get(++pos);
+        while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
+          pos++;
+          nextChar = buffer.get(pos);
+        }
+      }
+      if (buffer.get(pos) == -1) {
+        pos = posData.pos;
+      }
+
+      boolean anyMatches = false;
+
+      // First try user dict:
+      if (userFST != null) {
+        userFST.getFirstArc(arc);
+        int output = 0;
+        for(int posAhead=pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
+            break;
+          }
+          output += arc.output.intValue();
+          if (arc.isFinal()) {
+            if (VERBOSE) {
+              System.out.println("    USER word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
+            }
+            add(userDictionary, posData, pos, posAhead+1, output + arc.nextFinalOutput.intValue(), Type.USER);
+            anyMatches = true;
+          }
+        }
+      }
+
+      // TODO: we can be more aggressive about user
+      // matches?  if we are "under" a user match then don't
+      // extend KNOWN/UNKNOWN paths?
+
+      if (!anyMatches) {
+        // Next, try known dictionary matches
+        fst.getFirstArc(arc);
+        int output = 0;
+
+        for(int posAhead=pos;;posAhead++) {
+          final int ch = buffer.get(posAhead);
+          if (ch == -1) {
+            break;
+          }
+          //System.out.println("    match " + (char) ch + " posAhead=" + posAhead);
+
+          if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
+            break;
+          }
+
+          output += arc.output.intValue();
+
+          // Optimization: for known words that are too long
+          // (compound), we should pre-compute the 2nd
+          // best segmentation and store it in the
+          // dictionary instead of recomputing it each time a
+          // match is found.
+
+          if (arc.isFinal()) {
+            dictionary.lookupWordIds(output + arc.nextFinalOutput.intValue(), wordIdRef);
+            if (VERBOSE) {
+              System.out.println("    KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
+            }
+            for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+              add(dictionary, posData, pos, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
+              anyMatches = true;
+            }
+          }
+        }
+      }
+
+      if (unknownWordEndIndex > posData.pos) {
+        pos++;
+        continue;
+      }
+
+      final char firstCharacter = (char) buffer.get(pos);
+      if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {
+
+        // Find unknown match:
+        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+        final boolean isPunct = isPunctuation(firstCharacter);
+
+        // NOTE: copied from UnknownDictionary.lookup:
+        int unknownWordLength;
+        if (!characterDefinition.isGroup(firstCharacter)) {
+          unknownWordLength = 1;
+        } else {
+          // Extract the unknown word. Characters with the same character class are considered to be part of the unknown word.
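+          // For example (illustrative), a run of Latin letters or a run of
+          // digits becomes a single unknown token, ending where the
+          // character class changes.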
+          unknownWordLength = 1;
+          for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
+            final int ch = buffer.get(posAhead);
+            if (ch == -1) {
+              break;
+            }
+            if (characterId == characterDefinition.getCharacterClass((char) ch) &&
+                isPunctuation((char) ch) == isPunct) {
+              unknownWordLength++;
+            } else {
+              break;
+            }
+          }
+        }
+
+        unkDictionary.lookupWordIds(characterId, wordIdRef); // all characters of the unknown word share this character class
+        if (VERBOSE) {
+          System.out.println("    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
+        }
+        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+          add(unkDictionary, posData, pos, pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN);
+        }
+      }
+
+      pos++;
+    }
+
+    end = true;
+
+    if (pos > 0) {
+
+      final Position endPosData = positions.get(pos);
+      int leastCost = Integer.MAX_VALUE;
+      int leastIDX = -1;
+      if (VERBOSE) {
+        System.out.println("  end: " + endPosData.count + " nodes");
+      }
+      for(int idx=0;idx<endPosData.count;idx++) {
+        // Add EOS cost:
+        final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
+        //System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
+        if (cost < leastCost) {
+          leastCost = cost;
+          leastIDX = idx;
+        }
+      }
+
+      backtrace(endPosData, leastIDX);
+    } else {
+      // No characters in the input string; return no tokens!
+    }
+  }
+
+  // Backtrace from the provided position, back to the last
+  // time we back-traced, accumulating the resulting tokens to
+  // the pending list.  The pending list is then in-reverse
+  // (last token should be returned first).
+  private void backtrace(final Position endPosData, final int fromIDX) {
+    final int endPos = endPosData.pos;
+
+    if (VERBOSE) {
+      System.out.println("\n  backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
+    }
+
+    final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);
+
+    if (dotOut != null) {
+      dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+    }
+
+    int pos = endPos;
+    int bestIDX = fromIDX;
+
+    // TODO: sort of silly to make Token instances here; the
+    // back trace has all info needed to generate the
+    // token.  So, we could just directly set the attrs,
+    // from the backtrace, in incrementToken w/o ever
+    // creating Token; we'd have to defer calling freeBefore
+    // until after the backtrace was fully "consumed" by
+    // incrementToken.
+
+    while (pos > lastBackTracePos) {
+      //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
+      final Position posData = positions.get(pos);
+      assert bestIDX < posData.count;
+
+      int backPos = posData.backPos[bestIDX];
+      int backWordPos = posData.backWordPos[bestIDX];
+      assert backPos >= lastBackTracePos: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
+      // the length of the word, without the leading whitespace.
+      int length = pos - backWordPos;
+      Type backType = posData.backType[bestIDX];
+      int backID = posData.backID[bestIDX];
+      int nextBestIDX = posData.backIndex[bestIDX];
+      // the start of the word, after any leading whitespace.
+      final int fragmentOffset = backWordPos - lastBackTracePos;
+      assert fragmentOffset >= 0;
+
+      final Dictionary dict = getDict(backType);
+
+      if (outputUnknownUnigrams && backType == Type.UNKNOWN) {
+        // outputUnknownUnigrams converts unknown words into unigrams:
+        for (int i = length - 1; i >= 0; i--) {
+          int charLen = 1;
+          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
+            i--;
+            charLen = 2;
+          }
+          final DictionaryToken token = new DictionaryToken(Type.UNKNOWN,
+              unkDictionary,
+              CharacterDefinition.NGRAM,
+              fragment,
+              fragmentOffset+i,
+              charLen,
+              backWordPos+i,
+              backWordPos+i+charLen
+          );
+          if (shouldFilterToken(token) == false) {
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
+            }
+          }
+        }
+      } else {
+        final DictionaryToken token = new DictionaryToken(backType,
+            dict,
+            backID,
+            fragment,
+            fragmentOffset,
+            length,
+            backWordPos,
+            backWordPos + length
+        );
+        if (token.getPOSType() == POS.Type.MORPHEME || mode == DecompoundMode.NONE) {
+          if (shouldFilterToken(token) == false) {
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
+            }
+          }
+        } else {
+          Dictionary.Morpheme[] morphemes = token.getMorphemes();
+          if (morphemes == null) {
+            pending.add(token);
+            if (VERBOSE) {
+              System.out.println("    add token=" + pending.get(pending.size() - 1));
+            }
+          } else {
+            int endOffset = backWordPos + length;
+            int posLen = 0;
+            // decompose the compound
+            for (int i = morphemes.length - 1; i >= 0; i--) {
+              final Dictionary.Morpheme morpheme = morphemes[i];
+              final Token compoundToken;
+              if (token.getPOSType() == POS.Type.COMPOUND) {
+                assert endOffset - morpheme.surfaceForm.length() >= 0;
+                compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm,
+                    endOffset - morpheme.surfaceForm.length(), endOffset);
+              } else {
+                compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm, token.getStartOffset(), token.getEndOffset());
+              }
+              if (i == 0 && mode == DecompoundMode.MIXED) {
+                compoundToken.setPositionIncrement(0);
+              }
+              posLen++;
+              endOffset -= morpheme.surfaceForm.length();
+              pending.add(compoundToken);
+              if (VERBOSE) {
+                System.out.println("    add token=" + pending.get(pending.size() - 1));
+              }
+            }
+            if (mode == DecompoundMode.MIXED) {
+              token.setPositionLength(Math.max(1, posLen));
+              pending.add(token);
+              if (VERBOSE) {
+                System.out.println("    add token=" + pending.get(pending.size() - 1));
+              }
+            }
+          }
+        }
+      }
+
+      pos = backPos;
+      bestIDX = nextBestIDX;
+    }
+
+    lastBackTracePos = endPos;
+
+    if (VERBOSE) {
+      System.out.println("  freeBefore pos=" + endPos);
+    }
+    // Notify the circular buffers that we are done with
+    // these positions:
+    buffer.freeBefore(endPos);
+    positions.freeBefore(endPos);
+  }
+
+  Dictionary getDict(Type type) {
+    return dictionaryMap.get(type);
+  }
+
+  private boolean shouldFilterToken(Token token) {
+    return isPunctuation(token.getSurfaceForm()[token.getOffset()]);
+  }
+
+  private static boolean isPunctuation(char ch) {
+    switch(Character.getType(ch)) {
+      case Character.SPACE_SEPARATOR:
+      case Character.LINE_SEPARATOR:
+      case Character.PARAGRAPH_SEPARATOR:
+      case Character.CONTROL:
+      case Character.FORMAT:
+      case Character.DASH_PUNCTUATION:
+      case Character.START_PUNCTUATION:
+      case Character.END_PUNCTUATION:
+      case Character.CONNECTOR_PUNCTUATION:
+      case Character.OTHER_PUNCTUATION:
+      case Character.MATH_SYMBOL:
+      case Character.CURRENCY_SYMBOL:
+      case Character.MODIFIER_SYMBOL:
+      case Character.OTHER_SYMBOL:
+      case Character.INITIAL_QUOTE_PUNCTUATION:
+      case Character.FINAL_QUOTE_PUNCTUATION:
+        return true;
+      default:
+        return false;
+    }
+  }
+}