You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:23:04 UTC

svn commit: r1226632 [1/2] - in /lucene/dev/branches/lucene3305: dev-tools/eclipse/ modules/analysis/ modules/analysis/kuromoji/ modules/analysis/kuromoji/src/ modules/analysis/kuromoji/src/java/ modules/analysis/kuromoji/src/java/org/ modules/analysis...

Author: rmuir
Date: Tue Jan  3 04:22:59 2012
New Revision: 1226632

URL: http://svn.apache.org/viewvc?rev=1226632&view=rev
Log:
LUCENE-3305: current state (dictionary building and low level tests work)

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/cc.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/cd.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/dat.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/unk.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/unk_map.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/bocchan.utf-8   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/char.def.utf-8
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/tokenizer.properties   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/unk.def.utf-8
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt   (with props)
Modified:
    lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath
    lucene/dev/branches/lucene3305/modules/analysis/README.txt
    lucene/dev/branches/lucene3305/modules/analysis/build.xml

Modified: lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath Tue Jan  3 04:22:59 2012
@@ -24,6 +24,9 @@
 	<classpathentry kind="src" path="modules/analysis/icu/src/java"/>
 	<classpathentry kind="src" path="modules/analysis/icu/src/resources"/>
 	<classpathentry kind="src" path="modules/analysis/icu/src/test"/>
+        <classpathentry kind="src" path="modules/analysis/kuromoji/src/java"/>
+        <classpathentry kind="src" path="modules/analysis/kuromoji/src/resources"/>
+        <classpathentry kind="src" path="modules/analysis/kuromoji/src/test"/>
 	<classpathentry kind="src" path="modules/analysis/phonetic/src/java"/>
 	<classpathentry kind="src" path="modules/analysis/phonetic/src/test"/>
 	<classpathentry kind="src" path="modules/analysis/smartcn/src/java"/>

Modified: lucene/dev/branches/lucene3305/modules/analysis/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/README.txt?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/README.txt (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/README.txt Tue Jan  3 04:22:59 2012
@@ -22,6 +22,12 @@ lucene-analyzers-icu-XX.jar
   International Components for Unicode (ICU). Note: this module depends on
   the ICU4j jar file (version >= 4.6.0)
 
+lucene-analyzers-kuromoji-XX.jar
+  An analyzer with morphological analysis for Japanese.
+
+lucene-analyzers-morfologik-XX.jar
+  An analyzer using the Morfologik stemming library.
+
 lucene-analyzers-phonetic-XX.jar
   An add-on analysis library that provides phonetic encoders via Apache
   Commons-Codec. Note: this module depends on the commons-codec jar 
@@ -35,21 +41,20 @@ lucene-analyzers-stempel-XX.jar
   An add-on analysis library that contains a universal algorithmic stemmer,
   including tables for the Polish language.
 
-lucene-analyzers-morfologik-XX.jar
-  An analyzer using the Morfologik stemming library.
-
 common/src/java
 icu/src/java
+kuromoji/src/java
+morfologik/src/java
 phonetic/src/java
 smartcn/src/java
 stempel/src/java
-morfologik/src/java
-  The source code for the ffve libraries.
+  The source code for the libraries.
 
 common/src/test
 icu/src/test
+kuromoji/src/test
+morfologik/src/test
 phonetic/src/test
 smartcn/src/test
 stempel/src/test
-morfologik/src/test
-  Unit tests for the five libraries.
+  Unit tests for the libraries.

Modified: lucene/dev/branches/lucene3305/modules/analysis/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/build.xml?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/build.xml (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/build.xml Tue Jan  3 04:22:59 2012
@@ -23,9 +23,10 @@
     Additional Analyzers
       - common:	Additional Analyzers
       - icu: Analyzers that use functionality from ICU
+      - kuromoji:       Japanese Morphological Analyzer
+      - morfologik:	Morfologik Stemmer
       - smartcn:	Smart Analyzer for Simplified Chinese Text
       - stempel:	Algorithmic Stemmer for Polish
-      - morfologik:	Morfologik Stemmer
   </description>
 
   <target name="common">
@@ -36,6 +37,14 @@
     <ant dir="icu" />
   </target>
 
+  <target name="kuromoji">
+    <ant dir="kuromoji" />
+  </target>
+
+  <target name="morfologik">
+    <ant dir="morfologik" />
+  </target>
+
   <target name="phonetic">
     <ant dir="phonetic" />
   </target>
@@ -48,52 +57,53 @@
     <ant dir="stempel" />
   </target>
 
-  <target name="morfologik">
-    <ant dir="morfologik" />
-  </target>
-
   <target name="default" depends="compile"/>
-  <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
+  <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel" />
 
   <target name="clean">
     <ant dir="common" target="clean" />
     <ant dir="icu" target="clean" />
+    <ant dir="kuromoji" target="clean"/>
+    <ant dir="morfologik" target="clean" />
     <ant dir="phonetic" target="clean" />
     <ant dir="smartcn" target="clean" />
     <ant dir="stempel" target="clean" />
-    <ant dir="morfologik" target="clean" />
   </target>
   <target name="validate">
     <ant dir="common" target="validate" />
     <ant dir="icu" target="validate" />
+    <ant dir="kuromoji" target="validate" />
+    <ant dir="morfologik" target="validate" />
     <ant dir="phonetic" target="validate" />
     <ant dir="smartcn" target="validate" />
     <ant dir="stempel" target="validate" />
-    <ant dir="morfologik" target="validate" />
   </target>
   <target name="compile-core">
     <ant dir="common" target="compile-core" />
     <ant dir="icu" target="compile-core" />
+    <ant dir="kuromoji" target="compile-core" />
+    <ant dir="morfologik" target="compile-core" />
     <ant dir="phonetic" target="compile-core" />
     <ant dir="smartcn" target="compile-core" />
     <ant dir="stempel" target="compile-core" />
-    <ant dir="morfologik" target="compile-core" />
   </target>
   <target name="compile-test">
     <ant dir="common" target="compile-test" />
     <ant dir="icu" target="compile-test" />
+    <ant dir="kuromoji" target="compile-test" />
+    <ant dir="morfologik" target="compile-test" />
     <ant dir="phonetic" target="compile-test" />
     <ant dir="smartcn" target="compile-test" />
     <ant dir="stempel" target="compile-test" />
-    <ant dir="morfologik" target="compile-test" />
   </target>
   <target name="test">
     <ant dir="common" target="test" />
     <ant dir="icu" target="test" />
+    <ant dir="kuromoji" target="test" />
+    <ant dir="morfologik" target="test" />
     <ant dir="phonetic" target="test" />
     <ant dir="smartcn" target="test" />
     <ant dir="stempel" target="test" />
-    <ant dir="morfologik" target="test" />
   </target>
 
   <target name="build-artifacts-and-tests" depends="default,compile-test" />
@@ -101,28 +111,31 @@
   <target name="dist-maven" depends="default,javadocs">
     <ant dir="common" target="dist-maven" />
     <ant dir="icu" target="dist-maven" />
+    <ant dir="kuromoji" target="dist-maven" />
+    <ant dir="morfologik" target="dist-maven" />
     <ant dir="phonetic" target="dist-maven" />
     <ant dir="smartcn" target="dist-maven" />
     <ant dir="stempel" target="dist-maven" />
-    <ant dir="morfologik" target="dist-maven" />
   </target>  	
 
   <target name="javadocs">
     <ant dir="common" target="javadocs" />
     <ant dir="icu" target="javadocs" />
+    <ant dir="kuromoji" target="javadocs" />
+    <ant dir="morfologik" target="javadocs" />
     <ant dir="phonetic" target="javadocs" />
     <ant dir="smartcn" target="javadocs" />
     <ant dir="stempel" target="javadocs" />
-    <ant dir="morfologik" target="javadocs" />
   </target>  	
 
   <target name="javadocs-index.html">
     <ant dir="common" target="javadocs-index.html" />
     <ant dir="icu" target="javadocs-index.html" />
+    <ant dir="kuromoji" target="javadocs-index.html" />
+    <ant dir="morfologik" target="javadocs-index.html" />
     <ant dir="phonetic" target="javadocs-index.html" />
     <ant dir="smartcn" target="javadocs-index.html" />
     <ant dir="stempel" target="javadocs-index.html" />
-    <ant dir="morfologik" target="javadocs-index.html" />
   </target>
 	
 </project>

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml Tue Jan  3 04:22:59 2012
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="analyzers-kuromoji" default="default">
+
+  <description>
+    Kuromoji Japanese Morphological Analyzer
+  </description>
+	
+  <property name="build.dir" location="../build/kuromoji" />
+  <property name="dist.dir" location="../dist/kuromoji" />
+  <!-- Dictionary source settings: the archive named by dict.src.file is
+       fetched from dict.url and unpacked below build.dir; the remaining
+       dict.* values are passed as arguments to DictionaryBuilder in build-dict. -->
+  <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
+  <property name="dict.src.file" value="${ipadic.version}.tar.gz" />
+  <!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
+  <property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
+  <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
+  <property name="dict.encoding" value="euc-jp"/>
+  <property name="dict.format" value="ipadic"/>
+  <property name="dict.normalize" value="true"/>
+  <property name="dict.target.dir" location="./src/resources"/>
+  <import file="../../../lucene/contrib/contrib-build.xml"/> 
+
+  <!-- dict.available is set when the unpacked dictionary directory already
+       exists; download-dict is skipped in that case. -->
+  <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+  <!-- Downloads the dictionary archive into build.dir, then gunzips and
+       untars it there. Not run when dict.available is set. -->
+  <target name="download-dict" unless="dict.available">
+     <get src="${dict.url}" dest="${build.dir}/${dict.src.file}"/>
+     <gunzip src="${build.dir}/${dict.src.file}"/>
+     <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
+  </target>
+
+  <!-- Runs DictionaryBuilder in a forked JVM (512m max heap) with five
+       arguments, in this order: dictionary format, source directory,
+       target directory, source encoding, normalize flag. -->
+  <target name="build-dict" depends="compile-core, download-dict">
+    <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
+      <classpath>
+        <!-- NOTE(review): "classpath" is declared above as a path id, while
+             ${classpath} expands a property. Confirm the imported build file
+             defines such a property; otherwise a refid may be intended. -->
+        <pathelement path="${classpath}"/>
+        <pathelement path="${build.dir}/classes/java"/>
+      </classpath>
+      <arg value="${dict.format}"/>
+      <arg value="${dict.src.dir}"/>
+      <arg value="${dict.target.dir}"/>
+      <arg value="${dict.encoding}"/>
+      <arg value="${dict.normalize}"/>
+    </java>
+  </target>
+</project>

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionaries;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+
+/**
+ * Debugging aid: builds the Viterbi lattice for a text, searches the best
+ * path through it and hands both to a {@link GraphvizFormatter}.
+ * Instances are obtained through {@link #builder()}.
+ */
+public class DebugTokenizer {
+
+	private final GraphvizFormatter formatter;
+
+	private final Viterbi viterbi;
+
+	/**
+	 * Wires a {@link Viterbi} and a {@link GraphvizFormatter} to the shared
+	 * dictionaries exposed by {@link Dictionaries}.
+	 *
+	 * @param userDictionary optional user dictionary, may be null
+	 * @param mode tokenization mode passed on to {@link Viterbi}
+	 */
+	protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
+
+		this.viterbi = new Viterbi(Dictionaries.getTrie(),
+								   Dictionaries.getDictionary(),
+								   Dictionaries.getUnknownDictionary(),
+								   Dictionaries.getCosts(),
+								   userDictionary,
+								   mode);
+
+		this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
+	}
+
+	/**
+	 * Builds the lattice for the given text, searches the best path and
+	 * formats the result.
+	 *
+	 * @param text text to analyze
+	 * @return output of {@link GraphvizFormatter} for the first two lattice
+	 *         arrays and the best path
+	 */
+	public String debugTokenize(String text) {
+		ViterbiNode[][][] lattice = this.viterbi.build(text);
+		List<ViterbiNode> bestPath = this.viterbi.search(lattice);
+		return this.formatter.format(lattice[0], lattice[1], bestPath);
+	}
+
+	/**
+	 * Get Builder to create DebugTokenizer instance.
+	 * @return Builder
+	 */
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	/**
+	 * Builder class used to create DebugTokenizer instance.
+	 */
+	public static class Builder {
+
+		private Mode mode = Mode.NORMAL;
+
+		private UserDictionary userDictionary = null;
+
+		/**
+		 * Set tokenization mode
+		 * Default: NORMAL
+		 * @param mode tokenization mode
+		 * @return Builder
+		 */
+		public synchronized Builder mode(Mode mode) {
+			this.mode = mode;
+			return this;
+		}
+
+		/**
+		 * Set user dictionary input stream. The stream is not closed by this
+		 * method; the caller stays responsible for it.
+		 * @param userDictionaryInputStream dictionary file as input stream
+		 * @return Builder
+		 * @throws IOException if reading the dictionary fails
+		 */
+		public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
+			throws IOException {
+			this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+			return this;
+		}
+
+		/**
+		 * Set user dictionary path. A null or empty path leaves the builder
+		 * unchanged, matching the behavior of Tokenizer.Builder.
+		 * @param userDictionaryPath path to dictionary file
+		 * @return Builder
+		 * @throws FileNotFoundException if the file cannot be opened
+		 * @throws IOException if reading the dictionary fails
+		 */
+		public synchronized Builder userDictionary(String userDictionaryPath)
+			throws FileNotFoundException, IOException {
+			if (userDictionaryPath != null && !userDictionaryPath.isEmpty()) {
+				InputStream stream = new BufferedInputStream(new FileInputStream(userDictionaryPath));
+				try {
+					this.userDictionary(stream);
+				} finally {
+					// The stream is opened here, so it is closed here as well;
+					// previously the file handle was never released.
+					stream.close();
+				}
+			}
+			return this;
+		}
+
+		/**
+		 * Create DebugTokenizer instance
+		 * @return DebugTokenizer
+		 */
+		public synchronized DebugTokenizer build() {
+			return new DebugTokenizer(userDictionary, mode);
+		}
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Immutable result of tokenization: a surface form with its position in the
+ * input, plus the word id and dictionary used to look up further features.
+ */
+public class Token {
+	private final int wordId;
+
+	private final String surfaceForm;
+
+	private final Type type;
+
+	private final int position;
+
+	private final Dictionary dictionary;
+
+	/**
+	 * Creates a token.
+	 * @param wordId id used for all lookups in the given dictionary
+	 * @param surfaceForm text of the token
+	 * @param type node type the token was created from
+	 * @param position index of the token in the input text
+	 * @param dictionary dictionary that answers feature lookups for wordId
+	 */
+	public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+		this.dictionary = dictionary;
+		this.position = position;
+		this.type = type;
+		this.surfaceForm = surfaceForm;
+		this.wordId = wordId;
+	}
+
+	/**
+	 * Get the text of this token
+	 * @return surfaceForm
+	 */
+	public String getSurfaceForm() {
+		return this.surfaceForm;
+	}
+
+	/**
+	 * Look up all features of this token in its dictionary
+	 * @return all features
+	 */
+	public String getAllFeatures() {
+		return this.dictionary.getAllFeatures(this.wordId);
+	}
+
+	/**
+	 * Look up all features of this token in its dictionary
+	 * @return all features as array
+	 */
+	public String[] getAllFeaturesArray() {
+		return this.dictionary.getAllFeaturesArray(this.wordId);
+	}
+
+	/**
+	 * Look up the reading of this token in its dictionary
+	 * @return reading. null if token doesn't have reading.
+	 */
+	public String getReading() {
+		return this.dictionary.getReading(this.wordId);
+	}
+
+	/**
+	 * Look up the part of speech of this token in its dictionary
+	 * @return part of speech.
+	 */
+	public String getPartOfSpeech() {
+		return this.dictionary.getPartOfSpeech(this.wordId);
+	}
+
+	/**
+	 * Tells whether this token is a known word
+	 * @return true if the type of this token is KNOWN. false if not.
+	 */
+	public boolean isKnown() {
+		return Type.KNOWN == this.type;
+	}
+
+	/**
+	 * Tells whether this token is an unknown word
+	 * @return true if the type of this token is UNKNOWN. false if not.
+	 */
+	public boolean isUnknown() {
+		return Type.UNKNOWN == this.type;
+	}
+
+	/**
+	 * Tells whether this token comes from the user dictionary
+	 * @return true if the type of this token is USER. false if not.
+	 */
+	public boolean isUser() {
+		return Type.USER == this.type;
+	}
+
+	/**
+	 * Get index of this token in input text
+	 * @return position of token
+	 */
+	public int getPosition() {
+		return this.position;
+	}
+
+}
\ No newline at end of file

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,238 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionaries;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Tokenizer main class. Instances are obtained through {@link #builder()}.
+ * Intended to be thread safe: all fields are assigned in the constructor and
+ * never modified afterwards (this assumes {@link Viterbi} and the dictionaries
+ * are themselves safe for concurrent use).
+ */
+public class Tokenizer {
+	/** Tokenization modes; the selected mode is passed on to {@link Viterbi}. */
+	public enum Mode {
+		NORMAL, SEARCH, EXTENDED
+	}
+
+	/** IDEOGRAPHIC FULL STOP (U+3002), first of the two split characters. */
+	private static final char MARU = '\u3002';
+
+	/** IDEOGRAPHIC COMMA (U+3001), second of the two split characters. */
+	private static final char TEN = '\u3001';
+
+	private final Viterbi viterbi;
+
+	private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+	private final boolean split;
+
+	/**
+	 * Constructor. The system dictionaries, trie and connection costs are
+	 * taken from {@link Dictionaries}.
+	 * @param userDictionary optional user dictionary, may be null
+	 * @param mode tokenization mode passed on to {@link Viterbi}
+	 * @param split whether input is split at the split characters before tokenizing
+	 */
+	protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
+
+		this.viterbi = new Viterbi(Dictionaries.getTrie(),
+				                   Dictionaries.getDictionary(),
+				                   Dictionaries.getUnknownDictionary(),
+				                   Dictionaries.getCosts(),
+				                   userDictionary,
+				                   mode);
+
+		this.split = split;
+
+		// One dictionary per node type, so each Token can resolve its features.
+		// The USER entry is null when no user dictionary was configured.
+		dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
+		dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
+		dictionaryMap.put(Type.USER, userDictionary);
+	}
+
+	/**
+	 * Tokenize input text. When splitting is enabled the text is cut after
+	 * every split character and each piece is tokenized separately; token
+	 * positions still refer to the complete input text.
+	 * @param text text to tokenize
+	 * @return list of Token
+	 */
+	public List<Token> tokenize(String text) {
+
+		if (!split) {
+			return doTokenize(0, text);
+		}
+
+		List<Integer> splitPositions = getSplitPositions(text);
+
+		if (splitPositions.isEmpty()) {
+			return doTokenize(0, text);
+		}
+
+		List<Token> result = new ArrayList<Token>();
+		int offset = 0;
+		for (int position : splitPositions) {
+			// The split character itself stays at the end of its piece.
+			result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+			offset = position + 1;
+		}
+
+		// Remainder after the last split character, if any.
+		if (offset < text.length()) {
+			result.addAll(doTokenize(offset, text.substring(offset)));
+		}
+
+		return result;
+	}
+
+	/**
+	 * Find the punctuation marks at which the input text is split, which are
+	 * the ideographic full stop and the ideographic comma.
+	 * @param text text to scan
+	 * @return indexes of all split characters in ascending order
+	 */
+	private List<Integer> getSplitPositions(String text) {
+		List<Integer> splitPositions = new ArrayList<Integer>();
+
+		for (int i = 0; i < text.length(); i++) {
+			char c = text.charAt(i);
+			if (c == MARU || c == TEN) {
+				splitPositions.add(i);
+			}
+		}
+
+		return splitPositions;
+	}
+
+	/**
+	 * Tokenize input sentence.
+	 * @param offset offset of sentence in original input text
+	 * @param sentence sentence to tokenize
+	 * @return list of Token
+	 */
+	private List<Token> doTokenize(int offset, String sentence) {
+		List<Token> result = new ArrayList<Token>();
+
+		ViterbiNode[][][] lattice = viterbi.build(sentence);
+		List<ViterbiNode> bestPath = viterbi.search(lattice);
+		for (ViterbiNode node : bestPath) {
+			int wordId = node.getWordId();
+			if (node.getType() == Type.KNOWN && wordId == 0) { // Do not include BOS/EOS
+				continue;
+			}
+			// Pass different dictionary based on the type of node
+			Dictionary dictionary = dictionaryMap.get(node.getType());
+			result.add(new Token(wordId, node.getSurfaceForm(), node.getType(),
+					offset + node.getStartIndex(), dictionary));
+		}
+
+		return result;
+	}
+
+	/**
+	 * Get Builder to create Tokenizer instance.
+	 * @return Builder
+	 */
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	/**
+	 * Builder class used to create Tokenizer instance.
+	 */
+	public static class Builder {
+
+		private Mode mode = Mode.NORMAL;
+
+		private boolean split = true;
+
+		private UserDictionary userDictionary = null;
+
+		/**
+		 * Set tokenization mode
+		 * Default: NORMAL
+		 * @param mode tokenization mode
+		 * @return Builder
+		 */
+		public synchronized Builder mode(Mode mode) {
+			this.mode = mode;
+			return this;
+		}
+
+		/**
+		 * Set if tokenizer should split input string at the ideographic full stop and
+		 * ideographic comma before tokenizing, which is meant to increase performance.
+		 * Splitting is not expected to change the result of tokenization in most cases.
+		 * Default: true
+		 *
+		 * @param split whether tokenizer should split input string
+		 * @return Builder
+		 */
+		public synchronized Builder split(boolean split) {
+			this.split = split;
+			return this;
+		}
+
+		/**
+		 * Set user dictionary input stream. The stream is not closed by this
+		 * method; the caller stays responsible for it.
+		 * @param userDictionaryInputStream dictionary file as input stream
+		 * @return Builder
+		 * @throws IOException if reading the dictionary fails
+		 */
+		public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
+			this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+			return this;
+		}
+
+		/**
+		 * Set user dictionary path. A null or empty path leaves the builder unchanged.
+		 * @param userDictionaryPath path to dictionary file
+		 * @return Builder
+		 * @throws FileNotFoundException if the file cannot be opened
+		 * @throws IOException if reading the dictionary fails
+		 */
+		public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
+			if (userDictionaryPath != null && !userDictionaryPath.isEmpty()) {
+				InputStream stream = new BufferedInputStream(new FileInputStream(userDictionaryPath));
+				try {
+					this.userDictionary(stream);
+				} finally {
+					// The stream is opened here, so it is closed here as well;
+					// previously the file handle was never released.
+					stream.close();
+				}
+			}
+			return this;
+		}
+
+		/**
+		 * Create Tokenizer instance
+		 * @return Tokenizer
+		 */
+		public synchronized Tokenizer build() {
+			return new Tokenizer(userDictionary, mode, split);
+		}
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+import java.util.EnumMap;
+
+/**
+ * Maps every UTF-16 code unit to a {@link CharacterClass} and stores, per
+ * class, the "invoke" and "group" settings.
+ */
+public final class CharacterDefinition implements Serializable {
+	private static final long serialVersionUID = -1436753619176638532L;
+	
+	/** Character class of each char, indexed by the char value. */
+	private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
+
+	/** Per-class settings stored as { invoke, group, length }. */
+	private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
+		new EnumMap<CharacterClass, int[]>(CharacterClass.class);
+
+	public enum CharacterClass {
+		NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
+
+		/**
+		 * @return numeric id of this class (its ordinal)
+		 */
+		public int getId() {
+			return ordinal();
+		}
+	}
+
+	/**
+	 * Constructor. Every character starts out as {@link CharacterClass#DEFAULT}.
+	 */
+	public CharacterDefinition() {
+		for (int i = 0; i < characterCategoryMap.length; i++) {
+			characterCategoryMap[i] = CharacterClass.DEFAULT;
+		}
+	}
+
+	/**
+	 * @param c character to classify
+	 * @return id of the character class of <code>c</code>
+	 */
+	public int lookup(char c) {
+		return characterCategoryMap[c].getId();
+	}
+
+	/**
+	 * @param c character to classify
+	 * @return character class of <code>c</code>
+	 */
+	public CharacterClass getCharacterClass(char c) {
+		return characterCategoryMap[c];
+	}
+
+	/**
+	 * @param c character to check
+	 * @return true if the class of <code>c</code> has its invoke value set to 1;
+	 *         false if it is not, or if the class has no invoke definition
+	 */
+	public boolean isInvoke(char c) {
+		return isFlagSet(c, 0);
+	}
+
+	/**
+	 * @param c character to check
+	 * @return true if the class of <code>c</code> has its group value set to 1;
+	 *         false if it is not, or if the class has no invoke definition
+	 */
+	public boolean isGroup(char c) {
+		return isFlagSet(c, 1);
+	}
+
+	/**
+	 * Reads one slot of the invoke definition of the class of <code>c</code>.
+	 * A class without a registered definition is treated as "flag not set"
+	 * instead of failing with a NullPointerException.
+	 */
+	private boolean isFlagSet(char c, int index) {
+		int[] invokeDefinition = invokeDefinitionMap.get(characterCategoryMap[c]);
+		return invokeDefinition != null && invokeDefinition[index] == 1;
+	}
+
+	/**
+	 * @param c character to check
+	 * @return true if <code>c</code> is classified as KANJI or KANJINUMERIC
+	 */
+	public boolean isKanji(char c) {
+		CharacterClass characterClass = characterCategoryMap[c];
+		return characterClass == CharacterClass.KANJI ||
+			   characterClass == CharacterClass.KANJINUMERIC;
+	}
+
+	/**
+	 * Put mapping from unicode code point to character class.
+	 * 
+	 * @param codePoint
+	 *            code point; must index into the 65536-entry table
+	 * @param characterClassName
+	 *            character class name; when several names are given separated
+	 *            by spaces, only the first one is used
+	 */
+	public void putCharacterCategory(int codePoint, String characterClassName) {
+		// use first category class
+		String firstClassName = characterClassName.split(" ")[0];
+
+		// Override Nakaguro
+		if (codePoint == 0x30FB) {
+			firstClassName = "SYMBOL";
+		}
+		characterCategoryMap[codePoint] = CharacterClass.valueOf(firstClassName);
+	}
+
+	/**
+	 * Register the settings of a character class.
+	 * 
+	 * @param characterClassName name of a {@link CharacterClass} constant
+	 * @param invoke invoke value (1 means set)
+	 * @param group group value (1 means set)
+	 * @param length length value, stored as given
+	 */
+	public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+		CharacterClass characterClass = CharacterClass.valueOf(characterClassName);
+		invokeDefinitionMap.put(characterClass, new int[] { invoke, group, length });
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,80 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+
+/**
+ * Table of connection costs addressed by a forward id and a backward id,
+ * serializable to and from the {@link #FILENAME} resource.
+ */
+public class ConnectionCosts implements Serializable{
+
+	private static final long serialVersionUID = -7704592689635266457L;
+
+	public static final String FILENAME = "cc.dat";
+
+	/** Cost reported for id pairs that fall outside the table. */
+	private static final int OUT_OF_RANGE_COST = 50000;
+		
+	private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
+	
+	public ConnectionCosts() {
+		
+	}
+	
+	/**
+	 * @param forwardSize number of forward ids
+	 * @param backwardSize number of backward ids
+	 */
+	public ConnectionCosts(int forwardSize, int backwardSize) {
+		this.costs = new short[backwardSize][forwardSize]; 
+	}
+
+	/**
+	 * Store a cost; the value is narrowed to a short.
+	 */
+	public void add(int forwardId, int backwardId, int cost) {
+		this.costs[backwardId][forwardId] = (short)cost;
+	}
+	
+	/**
+	 * @return the stored cost, or a high cost (50000) when either id lies
+	 *         outside the table
+	 */
+	public int get(int forwardId, int backwardId) {
+		// FIXME: There seems to be something wrong with the double array trie in some rare
+		// cases causing and IndexOutOfBoundsException.  Use a guard as a temporary work-around
+		// and return a high cost to advise Mr. Viterbi strongly to not use this transition
+		boolean backwardInRange = backwardId >= 0 && backwardId < costs.length;
+		if (backwardInRange && forwardId >= 0 && forwardId < costs[backwardId].length) {
+			return costs[backwardId][forwardId];
+		}
+		return OUT_OF_RANGE_COST;
+	}
+
+	/**
+	 * Serialize this instance to {@link #FILENAME} inside the given directory.
+	 * 
+	 * @param directoryname target directory
+	 * @throws IOException if the file cannot be written
+	 */
+	public void write(String directoryname) throws IOException {
+		String filename = directoryname + File.separator + FILENAME;
+		FileOutputStream fileStream = new FileOutputStream(filename);
+		try {
+			ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(fileStream));
+			outputStream.writeObject(this);
+			// Push buffered bytes down before the underlying file is closed.
+			outputStream.flush();
+		} finally {
+			fileStream.close();
+		}
+	}
+
+	/**
+	 * Load the instance stored as class path resource {@link #FILENAME}.
+	 * 
+	 * @throws IOException if the resource is missing or unreadable
+	 */
+	public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
+		InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
+		if (is == null) {
+			throw new IOException("Resource not found on class path: " + FILENAME);
+		}
+		return read(is);
+	}
+	
+	/**
+	 * Deserialize an instance from the stream; the stream is always closed.
+	 */
+	public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
+		try {
+			ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+			return (ConnectionCosts) ois.readObject();
+		} finally {
+			if (is != null) {
+				is.close();
+			}
+		}
+	}
+
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
+
+public final class Dictionaries {
+
+	private static TokenInfoDictionary dictionary;
+
+	private static UnknownDictionary unknownDictionary;
+
+	private static ConnectionCosts costs;
+
+	private static DoubleArrayTrie trie;
+	
+	private static boolean initialized = false;
+	
+	static {
+		load();
+	}
+
+	private static synchronized void load() {
+		// Loading happens once; later calls fall straight through.
+		if (!initialized) {
+			try {
+				dictionary = TokenInfoDictionary.getInstance();
+				unknownDictionary = UnknownDictionary.getInstance();
+				costs = ConnectionCosts.getInstance();
+				trie = DoubleArrayTrie.getInstance();
+				// Only flagged after all four resources loaded successfully.
+				initialized = true;
+			} catch (Exception ex) {
+				throw new RuntimeException("Could not load dictionaries!  Ouch, ouch, ouch...", ex);
+			}
+		}
+	}
+
+	/**
+	 * @return the dictionary
+	 */
+	public static TokenInfoDictionary getDictionary() {
+		return dictionary;
+	}
+
+	/**
+	 * @param dictionary the dictionary to set
+	 */
+	public static void setDictionary(TokenInfoDictionary dictionary) {
+		Dictionaries.dictionary = dictionary;
+	}
+
+	/**
+	 * @return the unknownDictionary
+	 */
+	public static UnknownDictionary getUnknownDictionary() {
+		return unknownDictionary;
+	}
+
+	/**
+	 * @param unknownDictionary the unknownDictionary to set
+	 */
+	public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
+		Dictionaries.unknownDictionary = unknownDictionary;
+	}
+
+	/**
+	 * @return the costs
+	 */
+	public static ConnectionCosts getCosts() {
+		return costs;
+	}
+
+	/**
+	 * @param costs the costs to set
+	 */
+	public static void setCosts(ConnectionCosts costs) {
+		Dictionaries.costs = costs;
+	}
+
+	/**
+	 * @return the trie
+	 */
+	public static DoubleArrayTrie getTrie() {
+		return trie;
+	}
+
+	/**
+	 * @param trie the trie to set
+	 */
+	public static void setTrie(DoubleArrayTrie trie) {
+		Dictionaries.trie = trie;
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,80 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Read access to the entries of a dictionary, addressed by word ID.
+ */
+public interface Dictionary {
+
+	/** Separator placed between feature values when they are kept in one string. */
+	public static final String INTERNAL_SEPARATOR = "\u0000";
+
+	/**
+	 * Get left id of specified word
+	 * @param wordId word ID of token
+	 * @return	left id
+	 */
+	public int getLeftId(int wordId);
+	
+	/**
+	 * Get right id of specified word
+	 * @param wordId word ID of token
+	 * @return	right id
+	 */
+	public int getRightId(int wordId);
+	
+	/**
+	 * Get word cost of specified word
+	 * @param wordId word ID of token
+	 * @return	word cost
+	 */
+	public int getWordCost(int wordId);
+
+	/**
+	 * Get all features of tokens
+	 * @param wordId word ID of token
+	 * @return All features of the token
+	 */
+	public String getAllFeatures(int wordId);
+
+	/**
+	 * Get all features as array
+	 * @param wordId word ID of token
+	 * @return Array containing all features of the token
+	 */
+	public String[] getAllFeaturesArray(int wordId);
+
+	/**
+	 * Get Part-Of-Speech of tokens
+	 * @param wordId word ID of token
+	 * @return Part-Of-Speech of the token
+	 */
+	public String getPartOfSpeech(int wordId);
+
+	/**
+	 * Get reading of tokens
+	 * @param wordId word ID of token
+	 * @return Reading of the token
+	 */
+	public String getReading(int wordId);
+	
+	/**
+	 * Get feature(s) of tokens
+	 * @param wordId word ID of token
+	 * @param fields indexes of the features to return. If this is empty, return all features.
+	 * @return Features of the token
+	 */
+	public String getFeature(int wordId, int... fields);
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,244 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
+
+import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+
+public class TokenInfoDictionary implements Dictionary{
+
+	public static final String FILENAME = "tid.dat";
+
+	public static final String TARGETMAP_FILENAME = "tid_map.dat";
+
+	protected ByteBuffer buffer;
+	
+	protected int[][] targetMap;
+
+	public TokenInfoDictionary() {
+	}
+	
+	public TokenInfoDictionary(int size) {
+		targetMap = new int[1][];
+		buffer = ByteBuffer.allocate(size);
+	}
+
+	/**
+	 * Append an entry to the dictionary buffer.
+	 * <p>
+	 * Each entry is written as left id, right id, word cost and byte length of
+	 * the features (one short each), followed by the features as chars joined
+	 * with <code>INTERNAL_SEPARATOR</code>.
+	 * 
+	 * @param entry fields of one dictionary line: entry[1] is the left id,
+	 *        entry[2] the right id, entry[3] the word cost and entry[4..] the
+	 *        features; entry[0] is not used here
+	 * @return current position of buffer, which will be wordId of next entry
+	 * @throws IllegalArgumentException if the features are too long for the
+	 *         short length field
+	 */
+	public int put(String[] entry) {
+		short leftId = Short.parseShort(entry[1]);
+		short rightId = Short.parseShort(entry[2]);
+		short wordCost = Short.parseShort(entry[3]);
+
+		// Join the features; an entry without features yields an empty string.
+		StringBuilder sb = new StringBuilder();
+		for (int i = 4; i < entry.length; i++){
+			if (i > 4) {
+				sb.append(INTERNAL_SEPARATOR);
+			}
+			sb.append(entry[i]);
+		}
+		String features = sb.toString();
+		int featuresSize = features.length() * 2;
+		if (featuresSize > Short.MAX_VALUE) {
+			throw new IllegalArgumentException("Features too long to store: " + featuresSize + " bytes");
+		}
+
+		// extend buffer if necessary: keep doubling until the entry fits
+		int required = 8 + featuresSize; // four short and features
+		if (required > buffer.remaining()) {
+			int newCapacity = Math.max(buffer.limit(), 1);
+			do {
+				newCapacity *= 2;
+			} while (newCapacity - buffer.position() < required);
+			ByteBuffer newBuffer = ByteBuffer.allocate(newCapacity);
+			buffer.flip();
+			newBuffer.put(buffer);
+			buffer = newBuffer;
+		}
+		
+		buffer.putShort(leftId);
+		buffer.putShort(rightId);
+		buffer.putShort(wordCost);
+		buffer.putShort((short)featuresSize);
+		for (int i = 0; i < features.length(); i++) {
+			buffer.putChar(features.charAt(i));
+		}
+
+		return buffer.position();
+	}
+
+	public void addMapping(int sourceId, int wordId) {
+		if(targetMap.length <= sourceId) {
+			int[][] newArray = new int[sourceId + 1][];
+			System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+			targetMap = newArray;
+		}
+		
+		// Prepare array -- extend the length of array by one
+		int[] current = targetMap[sourceId];
+		if (current == null) {
+			current = new int[1];
+		} else {
+			int[] newArray = new int[current.length + 1];
+			System.arraycopy(current, 0, newArray, 0, current.length);
+			current = newArray;
+		}
+		targetMap[sourceId] = current;
+
+		int[] targets = targetMap[sourceId];
+		targets[targets.length - 1] = wordId;
+	}
+	
+	public int[] lookupWordIds(int sourceId) {
+		return targetMap[sourceId];
+	}
+	
+	@Override	
+	public int getLeftId(int wordId) {
+		return buffer.getShort(wordId);
+	}
+
+	@Override
+	public int getRightId(int wordId) {
+		return buffer.getShort(wordId + 2);	// Skip left id
+	}
+	
+	@Override
+	public int getWordCost(int wordId) {
+		return buffer.getShort(wordId + 4);	// Skip left id and right id
+	}
+
+	@Override
+	public String[] getAllFeaturesArray(int wordId) {
+		int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
+		char[] targetArr = new char[size];
+		int offset = wordId + 6 + 2; // offset is position where features string starts
+		for(int i = 0; i < size; i++){
+			targetArr[i] = buffer.getChar(offset + i * 2);
+		}
+		String allFeatures = new String(targetArr);
+		return allFeatures.split(INTERNAL_SEPARATOR);
+	}
+	
+	@Override
+	public String getFeature(int wordId, int... fields) {
+		String[] allFeatures = getAllFeaturesArray(wordId);
+		StringBuilder sb = new StringBuilder();
+		
+		if(fields.length == 0){ // All features
+			for(String feature : allFeatures) {
+				sb.append(CSVUtil.quoteEscape(feature)).append(",");
+			}
+		} else if(fields.length == 1) { // One feature doesn't need to escape value
+			sb.append(allFeatures[fields[0]]).append(",");			
+		} else {
+			for(int field : fields){
+				sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+			}
+		}
+		
+		return sb.deleteCharAt(sb.length() - 1).toString();
+	}
+	
+	@Override
+	public String getReading(int wordId) {
+		return getFeature(wordId, 7);
+	}
+
+	@Override
+	public String getAllFeatures(int wordId) {
+		return getFeature(wordId);
+	}
+
+	@Override
+	public String getPartOfSpeech(int wordId) {
+		return getFeature(wordId, 0, 1, 2, 3);
+	}
+	
+
+	/**
+	 * Write dictionary in file
+	 * Dictionary format is:
+	 * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+	 * @param filename
+	 * @throws IOException
+	 */
+	public void write(String directoryname) throws IOException {
+		writeDictionary(directoryname + File.separator + FILENAME);
+		writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+	}
+
+	protected void writeTargetMap(String filename) throws IOException {
+		ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
+		oos.writeObject(targetMap);
+		oos.close();
+	}
+	
+	/**
+	 * Write the size of the dictionary data (int) followed by the data itself.
+	 * The position and limit of this dictionary's buffer are left unchanged,
+	 * so the dictionary can be written more than once.
+	 * 
+	 * @param filename file to write
+	 * @throws IOException if writing fails
+	 */
+	protected void writeDictionary(String filename) throws IOException {
+		FileOutputStream fos = new FileOutputStream(filename);
+		try {
+			// Flip a duplicate: position 0, limit at the end of the written data.
+			ByteBuffer data = buffer.duplicate();
+			data.flip();
+
+			DataOutputStream dos = new DataOutputStream(fos);
+			dos.writeInt(data.limit());
+			dos.flush();
+
+			// Write Buffer
+			WritableByteChannel channel = Channels.newChannel(fos);
+			while (data.hasRemaining()) {
+				channel.write(data);
+			}
+		} finally {
+			fos.close();
+		}
+	}
+	
+	/**
+	 * Read dictionary into directly allocated buffer.
+	 * @return TokenInfoDictionary instance
+	 * @throws IOException
+	 * @throws ClassNotFoundException 
+	 */
+	public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
+		TokenInfoDictionary dictionary = new TokenInfoDictionary();
+		ClassLoader loader = dictionary.getClass().getClassLoader();
+		dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+		dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+		return dictionary;
+	}
+	
+	/**
+	 * Read the serialized target map from the stream; the stream is always closed.
+	 * 
+	 * @param is stream holding a serialized <code>int[][]</code>
+	 * @throws IOException if reading fails
+	 * @throws ClassNotFoundException if deserialization fails
+	 */
+	protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
+		try {
+			ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+			targetMap = (int[][]) ois.readObject();
+		} finally {
+			if (is != null) {
+				is.close();
+			}
+		}
+	}
+	
+	/**
+	 * Read the dictionary data into a directly allocated, read-only buffer.
+	 * The stream starts with the data size (int) followed by that many bytes;
+	 * it is always closed.
+	 * 
+	 * @param is stream to read from
+	 * @throws IOException if reading fails or the stream ends before the
+	 *         announced number of bytes was read
+	 */
+	protected void loadDictionary(InputStream is) throws IOException {
+		try {
+			DataInputStream dis = new DataInputStream(is);
+			int size = dis.readInt();
+
+			ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+
+			// A single read may deliver fewer bytes than requested, so keep
+			// reading until the buffer is full.
+			ReadableByteChannel channel = Channels.newChannel(is);
+			while (tmpBuffer.hasRemaining()) {
+				if (channel.read(tmpBuffer) < 0) {
+					throw new IOException("Dictionary data truncated: expected " + size
+							+ " bytes, got " + tmpBuffer.position());
+				}
+			}
+			buffer = tmpBuffer.asReadOnlyBuffer();
+		} finally {
+			if (is != null) {
+				is.close();
+			}
+		}
+	}
+
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,142 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
+
+public class UnknownDictionary extends TokenInfoDictionary {
+
+	public static final String FILENAME = "unk.dat";
+	
+	public static final String TARGETMAP_FILENAME = "unk_map.dat";
+
+	public static final String CHARDEF_FILENAME = "cd.dat";
+
+	private CharacterDefinition characterDefinition;
+	
+	/**
+	 * Constructor
+	 */
+    public UnknownDictionary() {
+    }
+    
+    public UnknownDictionary(int size) {
+    	super(size);
+		characterDefinition = new CharacterDefinition();    	
+    }
+    
+    @Override
+    public int put(String[] entry) {
+    	// Get wordId of current entry
+    	int wordId = buffer.position();
+    	
+    	// Put entry
+		int result = super.put(entry);
+
+		// Put entry in targetMap
+		int characterId = CharacterClass.valueOf(entry[0]).getId();
+		addMapping(characterId, wordId);
+		return result;
+    }
+    
+    /**
+     * Determine the length of the unknown word at the start of the text.
+     * 
+     * @param text text to inspect; its first character is always read
+     * @return 1 if the class of the first character is not grouping, otherwise
+     *         the length of the leading run of characters that share the
+     *         character class of the first one
+     */
+    public int lookup(String text) {
+    	final char first = text.charAt(0);
+    	if (!characterDefinition.isGroup(first)) {
+    		return 1;
+    	}
+
+    	// Extend the word while the following characters stay in the same class
+    	final int firstClassId = characterDefinition.lookup(first);
+    	int end = 1;
+    	while (end < text.length() && characterDefinition.lookup(text.charAt(end)) == firstClassId) {
+    		end++;
+    	}
+    	return end;
+    }
+
+	/**
+	 * Put mapping from unicode code point to character class.
+	 * 
+	 * @param codePoint code point
+	 * @param class character class name
+	 */
+	public void putCharacterCategory(int codePoint, String characterClassName) {
+		characterDefinition.putCharacterCategory(codePoint, characterClassName);
+	}
+	
+	public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+		characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
+	}
+	
+
+	public CharacterDefinition getCharacterDefinition() {
+		return characterDefinition;
+	}
+	
+	/**
+	 * Write dictionary in file
+	 * Dictionary format is:
+	 * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+	 * @param filename
+	 * @throws IOException
+	 */
+	public void write(String directoryname) throws IOException {
+		writeDictionary(directoryname + File.separator + FILENAME);
+		writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+		writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
+	}
+	
+	protected void writeCharDef(String filename) throws IOException {
+		ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
+		oos.writeObject(characterDefinition);
+		oos.close();
+	}
+
+	public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
+		UnknownDictionary dictionary = new UnknownDictionary();
+		ClassLoader loader = dictionary.getClass().getClassLoader();
+		dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+		dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+		dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
+		return dictionary;
+	}
+
+	protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
+		ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+		characterDefinition = (CharacterDefinition) ois.readObject();
+		ois.close();
+	}
+	
+	@Override
+	public String getReading(int wordId) {
+		return null;
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,196 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+
+public class UserDictionary implements Dictionary {
+
+	private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
+	
+	private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
+	
+	private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
+
+	public static final int WORD_COST = -100000;
+
+	public static final int LEFT_ID = 5;
+
+	public static final int RIGHT_ID = 5;
+	
+	public UserDictionary() {
+		
+	}
+
+	/**
+	 * Lookup words in text
+	 * <p>
+	 * Keywords are tried in descending key order; when several keywords match
+	 * at the same position, the first one tried wins.
+	 * 
+	 * @param text text to search for dictionary keywords
+	 * @return array of {wordId, position, length}
+	 */
+	public int[][] lookup(String text) {
+		TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
+
+		for (String keyword : entries.descendingKeySet()) {
+			if (keyword.length() == 0) {
+				// An empty keyword matches everywhere without advancing.
+				continue;
+			}
+			int[] wordIdAndLength = entries.get(keyword);
+			int position = text.indexOf(keyword);
+			while (position >= 0) {
+				if (!result.containsKey(position)) {
+					result.put(position, wordIdAndLength);
+				}
+				// Resume right after this match. Accumulating the offset
+				// would jump past later occurrences.
+				position = text.indexOf(keyword, position + keyword.length());
+			}
+		}
+
+		return toIndexArray(result);
+	}
+
+	/**
+	 * Expand matches into one {wordId, index, length} triple per segment.
+	 * 
+	 * @param input map from start index to { first wordId, length, length... }
+	 * @return array of {wordId, index, length}
+	 */
+	private int[][] toIndexArray(Map<Integer, int[]> input) {
+		ArrayList<int[]> tokens = new ArrayList<int[]>();
+		for (Map.Entry<Integer, int[]> match : input.entrySet()) {
+			int[] wordIdAndLength = match.getValue();
+			int firstWordId = wordIdAndLength[0];
+			int start = match.getKey();
+			// Slot 0 is the wordId of the first segment; the remaining slots
+			// are segment lengths, which turn into consecutive start indexes.
+			for (int segment = 1; segment < wordIdAndLength.length; segment++) {
+				int length = wordIdAndLength[segment];
+				tokens.add(new int[] { firstWordId + segment - 1, start, length });
+				start += length;
+			}
+		}
+		return tokens.toArray(new int[tokens.size()][]);
+	}
+
+	@Override
+	public int getLeftId(int wordId) {
+		return LEFT_ID;
+	}
+
+	@Override
+	public int getRightId(int wordId) {
+		return RIGHT_ID;
+	}
+
+	@Override
+	public int getWordCost(int wordId) {
+		return WORD_COST;
+	}
+
+	@Override
+	public String getReading(int wordId) {
+		return getFeature(wordId, 0);
+	}
+
+	@Override
+	public String getPartOfSpeech(int wordId) {
+		return getFeature(wordId, 1);
+	}
+
+	@Override
+	public String getAllFeatures(int wordId) {
+		return getFeature(wordId);
+	}
+
+	@Override
+	public String[] getAllFeaturesArray(int wordId) {
+		String allFeatures = featureEntries.get(wordId);
+		if(allFeatures == null) {
+			return null;
+		}
+		
+		return allFeatures.split(INTERNAL_SEPARATOR);		
+	}
+
+	
+	@Override
+	public String getFeature(int wordId, int... fields) {
+		String[] allFeatures = getAllFeaturesArray(wordId);
+		if (allFeatures == null) {
+			return null;
+		}
+		StringBuilder sb = new StringBuilder();
+		if (fields.length == 0) { // All features
+			for (String feature : allFeatures) {
+				sb.append(CSVUtil.quoteEscape(feature)).append(",");
+			}
+		} else if (fields.length == 1) { // One feature doesn't need to escape value
+			sb.append(allFeatures[fields[0]]).append(",");			
+		} else {
+			for (int field : fields){
+				sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+			}
+		}
+		return sb.deleteCharAt(sb.length() - 1).toString();
+	}
+
+	/**
+	 * Read a user dictionary from a file.
+	 * 
+	 * @param filename path of the dictionary file
+	 * @return the parsed dictionary
+	 * @throws IOException if the file cannot be opened or read
+	 */
+	public static UserDictionary read(String filename) throws IOException {
+		InputStream is = new FileInputStream(filename);
+		try {
+			return read(is);
+		} finally {
+			// Release the file handle even when parsing fails.
+			is.close();
+		}
+	}
+
+	/**
+	 * Read a user dictionary from a stream of CSV lines. Text after '#' and
+	 * blank lines are ignored. Column 0 is the keyword, column 1 its
+	 * space-separated segmentation, column 2 the space-separated readings
+	 * (one per segment) and column 3 the part of speech.
+	 * <p>
+	 * Lines whose segmentation and readings differ in count are reported on
+	 * standard output and skipped. The stream is always closed.
+	 * 
+	 * @param is stream to read from
+	 * @return the parsed dictionary
+	 * @throws IOException if reading fails
+	 */
+	public static UserDictionary read(InputStream is) throws IOException {
+		UserDictionary dictionary = new UserDictionary();
+		// NOTE(review): decodes with the platform default charset -- confirm
+		// whether a fixed charset such as UTF-8 should be used instead.
+		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+		try {
+			String line = null;
+			int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
+			while ((line = reader.readLine()) != null) {
+				// Remove comments
+				line = line.replaceAll("#.*$", "");
+
+				// Skip empty lines or comment lines
+				if (line.trim().length() == 0) {
+					continue;
+				}
+				String[] values = CSVUtil.parse(line);
+				String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
+				String[] readings = values[2].replaceAll("  *", " ").split(" ");
+				String pos = values[3];
+
+				if (segmentation.length != readings.length) {
+					// FIXME: Should probably deal with this differently.  Exception?
+					// Skipped so that a segment without a reading cannot
+					// abort the load or leave a half-registered entry.
+					System.out.println("This entry is not properly formatted : " + line);
+					continue;
+				}
+
+				int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
+				wordIdAndLength[0] = wordId;
+				for (int i = 0; i < segmentation.length; i++) {
+					wordIdAndLength[i + 1] = segmentation[i].length();
+					dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
+					wordId++;
+				}
+				dictionary.entries.put(values[0], wordIdAndLength);
+			}
+		} finally {
+			reader.close();
+		}
+		return dictionary;
+	}
+
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,317 @@
+package org.apache.lucene.analysis.kuromoji.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.IntBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.ReadableByteChannel;
+
+import org.apache.lucene.analysis.kuromoji.trie.Trie.Node;
+
+/**
+ * Double array trie built from a {@link Trie}.
+ * <p>
+ * Nodes live in two parallel int arrays: <code>base</code> (offset used to compute the
+ * index of a child) and <code>check</code> (index of the parent). The child of the node at
+ * <code>index</code> for character <code>c</code> is stored at <code>index + base + c</code>.
+ * When everything below a node is a single path, the remaining characters are stored in a
+ * separate tail array and the node's base value is <code>TAIL_OFFSET</code> plus the
+ * position of those characters in the tail array.
+ */
+public class DoubleArrayTrie {
+
+	public static final String FILENAME = "dat.dat";
+
+	public static final char TERMINATING_CHARACTER = '\u0001';
+
+	private static final int BASE_CHECK_INITIAL_SIZE = 1000000;
+
+	private static final int TAIL_INITIAL_SIZE = 10000;
+
+	// Base values greater than or equal to this are references into the tail array
+	private static final int TAIL_OFFSET = 10000000;
+
+	private IntBuffer baseBuffer;
+
+	private IntBuffer checkBuffer;
+
+	private CharBuffer tailBuffer;
+
+	// Next free position in the tail array, stored with TAIL_OFFSET already added
+	private int tailIndex = TAIL_OFFSET;
+
+
+	public DoubleArrayTrie(){
+	}
+
+	/**
+	 * Write this trie to the file {@link #FILENAME} inside the given directory.
+	 * The file contains the base/check capacity and the tail capacity as two ints,
+	 * followed by the base array, the check array and the tail array.
+	 * @param directoryname directory in which the file is created; an existing file is replaced
+	 * @throws IOException if the file cannot be written
+	 */
+	public void write(String directoryname) throws IOException  {
+		String filename = directoryname + File.separator + FILENAME;
+
+		baseBuffer.rewind();
+		checkBuffer.rewind();
+		tailBuffer.rewind();
+
+		File file = new File(filename);
+		if(file.exists()){
+			file.delete();
+		}
+
+		RandomAccessFile raf = new RandomAccessFile(filename, "rw");
+		try {
+			// If the old file could not be deleted, make sure none of its content survives
+			raf.setLength(0);
+			FileChannel channel = raf.getChannel();
+			raf.writeInt(baseBuffer.capacity());
+			raf.writeInt(tailBuffer.capacity());
+
+			ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
+			IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
+			tmpIntBuffer.put(baseBuffer);
+			tmpBuffer.rewind();
+			writeFully(channel, tmpBuffer);
+
+			tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
+			tmpIntBuffer = tmpBuffer.asIntBuffer();
+			tmpIntBuffer.put(checkBuffer);
+			tmpBuffer.rewind();
+			writeFully(channel, tmpBuffer);
+
+			tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
+			CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
+			tmpCharBuffer.put(tailBuffer);
+			tmpBuffer.rewind();
+			writeFully(channel, tmpBuffer);
+		} finally {
+			raf.close();	// release the file even if writing failed
+		}
+	}
+
+	/**
+	 * Write all remaining bytes of the buffer to the channel.
+	 */
+	private static void writeFully(FileChannel channel, ByteBuffer buffer) throws IOException {
+		while(buffer.hasRemaining()) {
+			channel.write(buffer);
+		}
+	}
+
+	/**
+	 * Load the trie stored as resource {@link #FILENAME} on the class path.
+	 * @return the loaded trie
+	 * @throws IOException if the resource is missing or cannot be read
+	 */
+	public static DoubleArrayTrie getInstance() throws IOException {
+		InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
+		if(is == null) {
+			throw new IOException("Resource not found on class path: " + FILENAME);
+		}
+		return read(is);
+	}
+
+	/**
+	 * Load stored data in the layout produced by {@link #write(String)}.
+	 * @param is stream to read from; closed by this method
+	 * @return the loaded trie, backed by read-only buffers
+	 * @throws IOException if reading fails or the stream ends before all data has been read
+	 */
+	public static DoubleArrayTrie read(InputStream is) throws IOException {
+		try {
+			DoubleArrayTrie trie = new DoubleArrayTrie();
+			DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
+			int baseCheckSize = dis.readInt();	// Read size of baseArr and checkArr
+			int tailSize = dis.readInt();		// Read size of tailArr
+			ReadableByteChannel channel = Channels.newChannel(dis);
+
+			ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);	// 4 bytes per int
+			readFully(channel, tmpBaseBuffer);
+			tmpBaseBuffer.rewind();
+			trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
+
+			ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
+			readFully(channel, tmpCheckBuffer);
+			tmpCheckBuffer.rewind();
+			trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
+
+			ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2);			// 2 bytes per char
+			readFully(channel, tmpTailBuffer);
+			tmpTailBuffer.rewind();
+			trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
+
+			return trie;
+		} finally {
+			is.close();
+		}
+	}
+
+	/**
+	 * Fill the buffer completely from the channel. A single read on a channel may
+	 * transfer fewer bytes than requested, so reading is repeated until the buffer is full.
+	 */
+	private static void readFully(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
+		while(buffer.hasRemaining()) {
+			if(channel.read(buffer) < 0) {
+				throw new IOException("Unexpected end of stream, " + buffer.remaining() + " more bytes expected");
+			}
+		}
+	}
+
+	/**
+	 * Construct double array trie which is equivalent to input trie
+	 * @param trie normal trie which contains all dictionary words
+	 */
+	public void build(Trie trie) {
+		baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITIAL_SIZE * 4).asIntBuffer();
+		baseBuffer.put(0, 1);
+		checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITIAL_SIZE * 4).asIntBuffer();
+		tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
+		add(-1, 0, trie.getRoot());
+	}
+
+	/**
+	 * Add Node(character) to double array trie
+	 * @param previous index of the parent node, negative for the root
+	 * @param index index at which the node is stored
+	 * @param node node to add, together with everything below it
+	 */
+	private void add(int previous, int index, Node node) {
+		Node[] children = node.getChildren();	// nodes following current node
+
+		if(children.length > 0 && node.hasSinglePath() && children[0].getKey() != TERMINATING_CHARACTER) {	// If node has only one path, put the rest in tail array
+			baseBuffer.put(index, tailIndex);	// current index of tail array
+			addToTail(children[0]);
+			checkBuffer.put(index, previous);
+			return;	// No more child to process
+		}
+
+		int base = findBase(index, children);	// Get base value for current index
+		baseBuffer.put(index, base);
+
+		if(previous >= 0){
+			checkBuffer.put(index, previous);	// Set check value
+		}
+
+		for(Node child : children) {	// Add each child to double array trie
+			add(index, index + base + child.getKey(), child);
+		}
+
+	}
+
+	/**
+	 * Match input keyword.
+	 * @param key key to match
+	 * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
+	 */
+	public int lookup(String key) {
+		int index = 0;
+		int base = 1; // base at index 0 should be 1
+
+		int keyLength = key.length();
+		for(int i = 0; i < keyLength; i++) {
+			int previous = index;
+			index = index + base + key.charAt(i);
+
+			if(index < 0 || index >= baseBuffer.limit()) { // Outside of the array, so no such node
+				return -1;
+			}
+
+			base = baseBuffer.get(index);
+
+			if (base == 0 ) { // Didn't find match
+				return -1;
+			}
+
+			if(checkBuffer.get(index) != previous){	// check doesn't match
+				return -1;
+			}
+
+			if(base >= TAIL_OFFSET) {	// If base is bigger than TAIL_OFFSET, start processing "tail"
+				return matchTail(base, index, key.substring(i + 1));
+			}
+
+		}
+
+		// If we reach at the end of input keyword, check if it is complete match by looking for following terminating character
+		int endIndex = index + base + TERMINATING_CHARACTER;
+
+		if(endIndex < 0 || endIndex >= checkBuffer.limit()) {	// No slot for a terminating character, so prefix match only
+			return 0;
+		}
+
+		return checkBuffer.get(endIndex) == index ? index : 0;
+	}
+
+	/**
+	 * Check match in tail array
+	 * @param base base value of the node, which is TAIL_OFFSET plus the position in the tail array
+	 * @param index index of the node in the double array
+	 * @param key remaining characters of the key which have not been matched yet
+	 * @return	index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
+	 */
+	private int matchTail(int base, int index, String key) {
+		int positionInTailArr = base - TAIL_OFFSET;
+		int tailLimit = tailBuffer.limit();
+
+		int keyLength = key.length();
+		for(int i = 0; i < keyLength; i++) {
+			int position = positionInTailArr + i;
+			if(position >= tailLimit || key.charAt(i) != tailBuffer.get(position)){
+				return -1;
+			}
+		}
+
+		int endPosition = positionInTailArr + keyLength;
+		if(endPosition >= tailLimit) {	// key runs past the end of the stored characters
+			return -1;
+		}
+		return tailBuffer.get(endPosition) == TERMINATING_CHARACTER ? index : 0;
+
+	}
+
+	/**
+	 * Find base value for current node, which contains input nodes. They are children of current node.
+	 * Set default base value , which is one, at the index of each input node.
+	 * @param index index of the current node
+	 * @param nodes children of the current node
+	 * @return	base value for current node
+	 */
+	private int findBase(int index, Node[] nodes){
+		int base = baseBuffer.get(index);
+		if(base < 0) {
+			return base;
+		}
+
+		while(true) {
+			boolean collision = false;	// already taken?
+			for(Node node : nodes) {
+				/*
+				 * NOTE:
+				 * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
+				 * However, this makes array bigger. If there is a need to compact the file dat.dat, it's possible to modify here and there.
+				 * Although the size of jar file doesn't change, memory consumption will be smaller.
+				 */
+				int nextIndex = index + base + node.getKey();
+
+				if(baseBuffer.capacity() <= nextIndex) {
+					// Grow base and check together so that they always have the same capacity
+					int newLength = nextIndex + 1;
+					IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+					baseBuffer.rewind();
+					newBaseBuffer.put(baseBuffer);
+					baseBuffer = newBaseBuffer;
+					IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+					checkBuffer.rewind();
+					newCheckBuffer.put(checkBuffer);
+					checkBuffer = newCheckBuffer;
+				}
+
+				if(baseBuffer.get(nextIndex) != 0) {	// already taken
+					base++;	// check next base value
+					collision = true;
+					break;
+				}
+			}
+
+			if(!collision){
+				break;	// if there is no collision, found proper base value. Break the while loop.
+			}
+
+		}
+
+		for(Node node : nodes) {
+			baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1);	// Set -1 if key is terminating character. Set default base value 1 if not.
+		}
+
+		return base;
+	}
+
+	/**
+	 * Add characters(nodes) to tail array, following the first child of each node
+	 * until a node without children is reached.
+	 * @param node first node to store
+	 */
+	private void addToTail(Node node) {
+		while(true) {
+			if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
+				CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
+				tailBuffer.rewind();
+				newTailBuffer.put(tailBuffer);
+				tailBuffer = newTailBuffer;
+			}
+			tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node
+
+			if(node.getChildren().length == 0) {	// if it reached the end of input, break.
+				break;
+			}
+			node = node.getChildren()[0];	// Move to next node
+		}
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis.kuromoji.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Simple character trie. Every string is stored with a trailing
+ * {@link DoubleArrayTrie#TERMINATING_CHARACTER}.
+ */
+public class Trie {
+
+	private Node root;	// Node from which all stored strings start
+
+	/**
+	 * Create a Trie which consists of an empty root node only
+	 */
+	public Trie() {
+		root = new Node();
+	}
+
+	/**
+	 * Store a string in this Trie.
+	 * The terminating character(\u0001) is appended to the string before it is stored.
+	 * @param value String to store
+	 */
+	public void add(String value) {
+		root.add(value + DoubleArrayTrie.TERMINATING_CHARACTER);
+	}
+
+	/**
+	 * Return the node from which all stored strings start
+	 * @return	root node
+	 */
+	public Node getRoot() {
+		return root;
+	}
+
+	/**
+	 * A single node of the Trie: one character and the nodes which may follow it
+	 */
+	public class Node {
+		char key;						// character represented by this node
+
+		Node[] children = new Node[0];	// following nodes, in order of insertion
+
+		/**
+		 * Create a node without explicit key
+		 */
+		public Node() {
+		}
+
+		/**
+		 * Create a node for the given character
+		 * @param key character represented by this node
+		 */
+		public Node(char key) {
+			this.key = key;
+		}
+
+		/**
+		 * Store a string below this node, one node per character.
+		 * Nodes which already exist are reused. An empty string changes nothing.
+		 * @param value String to store
+		 */
+		public void add(String value) {
+			Node current = this;
+			int length = value.length();
+			for (int position = 0; position < length; position++) {
+				current = current.addChild(new Node(value.charAt(position)));
+			}
+		}
+
+		/**
+		 * Append a node to the children of this node unless there already is a child with the same key
+		 * @param newNode node to append
+		 * @return the child which has the key of newNode: the existing one if there is one, otherwise newNode
+		 */
+		public Node addChild(Node newNode) {
+			Node existing = getChild(newNode.getKey());
+			if (existing != null) {
+				return existing;
+			}
+
+			int oldLength = children.length;
+			Node[] grown = new Node[oldLength + 1];
+			System.arraycopy(children, 0, grown, 0, oldLength);
+			grown[oldLength] = newNode;
+			children = grown;
+			return newNode;
+		}
+
+		/**
+		 * Return the character represented by this node
+		 * @return key
+		 */
+		public char getKey() {
+			return key;
+		}
+
+		/**
+		 * Check whether there is no branching anywhere below this node.
+		 * For example, if the Trie holds "abcde" and "abfgh", the result is false for nodes "a" and "b"
+		 * and true for nodes "c", "d", "e", "f", "g" and "h".
+		 * @return true if no node below this node (nor this node itself) has more than one child. false otherwise.
+		 */
+		public boolean hasSinglePath() {
+			// Walk down as long as there is exactly one way to go
+			Node current = this;
+			while (current.children.length == 1) {
+				current = current.children[0];
+			}
+			// Stopped at a leaf: single path. Stopped at a branching node: multiple paths.
+			return current.children.length == 0;
+		}
+
+		/**
+		 * Return the nodes following this node
+		 * @return Array of children nodes
+		 */
+		public Node[] getChildren() {
+			return children;
+		}
+
+		/**
+		 * Find the child which represents the given character
+		 * @param key character to look for
+		 * @return child with that key. null if there is none.
+		 */
+		private Node getChild(char key) {
+			for (int i = 0; i < children.length; i++) {
+				Node candidate = children[i];
+				if (candidate.getKey() == key) {
+					return candidate;
+				}
+			}
+			return null;
+		}
+	}
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java Tue Jan  3 04:22:59 2012
@@ -0,0 +1,101 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Utility for reading and writing single lines of comma separated values.
+ * <p>
+ * {@link #quoteEscape(String)} doubles every quote of a value and wraps the value in
+ * quotes only if it contains a comma. {@link #parse(String)} reverses this.
+ */
+public class CSVUtil {
+	private static final char QUOTE = '"';
+
+	private static final char COMMA = ',';
+
+	// A value wrapped in quotes which has no quotes inside
+	private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
+
+	private static final String ESCAPED_QUOTE = "\"\"";
+
+	/**
+	 * Parse CSV line. Commas inside of quotes do not separate values.
+	 * Every value, including the last one, is unquoted and unescaped.
+	 * @param line line to parse
+	 * @return Array of values. Empty array if the line has an odd number of quotes.
+	 */
+	public static String[] parse(String line) {
+		boolean insideQuote = false;
+		ArrayList<String> result = new ArrayList<String>();
+		int quoteCount = 0;
+		StringBuilder sb = new StringBuilder();
+		for(int i = 0; i < line.length(); i++) {
+			char c = line.charAt(i);
+
+			if(c == QUOTE) {
+				insideQuote = !insideQuote;
+				quoteCount++;
+			}
+
+			if(c == COMMA && !insideQuote) {
+				result.add(unQuoteUnEscape(sb.toString()));
+				sb.setLength(0);
+				continue;
+			}
+
+			sb.append(c);
+		}
+
+		// Validate
+		if(quoteCount % 2 != 0) {
+			return new String[0];
+		}
+
+		// The last value is not followed by a comma, but has to be treated like all others
+		result.add(unQuoteUnEscape(sb.toString()));
+
+		return result.toArray(new String[result.size()]);
+	}
+
+	/**
+	 * Remove wrapping quotes from a value and turn doubled quotes back into single quotes.
+	 */
+	private static String unQuoteUnEscape(String original) {
+		String result = original;
+
+		// Unquote
+		Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
+		if(m.matches()) {
+			result = m.group(1);
+		} else {
+			// A comma can only be part of a value which was wrapped in quotes, so the
+			// outer quotes are wrapping quotes even if there are escaped quotes inside.
+			int length = original.length();
+			if(length >= 3 && original.charAt(0) == QUOTE && original.charAt(length - 1) == QUOTE
+					&& original.indexOf(COMMA) >= 0) {
+				result = original.substring(1, length - 1);
+			}
+		}
+
+		// Unescape
+		result = result.replace(ESCAPED_QUOTE, "\"");
+
+		return result;
+
+	}
+
+	/**
+	 * Quote and escape input value for CSV.
+	 * Every quote is doubled. The value is wrapped in quotes if it contains a comma.
+	 * @param original value to write
+	 * @return value which can be used as one column of a CSV line
+	 */
+	public static String quoteEscape(String original) {
+		String result = original.replace("\"", ESCAPED_QUOTE);
+		if(result.indexOf(COMMA) >= 0) {
+			result = "\"" + result + "\"";
+		}
+		return result;
+	}
+
+}