You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:23:04 UTC
svn commit: r1226632 [1/2] - in /lucene/dev/branches/lucene3305:
dev-tools/eclipse/ modules/analysis/ modules/analysis/kuromoji/
modules/analysis/kuromoji/src/ modules/analysis/kuromoji/src/java/
modules/analysis/kuromoji/src/java/org/ modules/analysis...
Author: rmuir
Date: Tue Jan 3 04:22:59 2012
New Revision: 1226632
URL: http://svn.apache.org/viewvc?rev=1226632&view=rev
Log:
LUCENE-3305: current state (dictionary building and low level tests work)
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/cc.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/cd.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/dat.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/unk.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/unk_map.dat (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/bocchan.utf-8 (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/char.def.utf-8
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/tokenizer.properties (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/unk.def.utf-8
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (with props)
Modified:
lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath
lucene/dev/branches/lucene3305/modules/analysis/README.txt
lucene/dev/branches/lucene3305/modules/analysis/build.xml
Modified: lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/branches/lucene3305/dev-tools/eclipse/dot.classpath Tue Jan 3 04:22:59 2012
@@ -24,6 +24,9 @@
<classpathentry kind="src" path="modules/analysis/icu/src/java"/>
<classpathentry kind="src" path="modules/analysis/icu/src/resources"/>
<classpathentry kind="src" path="modules/analysis/icu/src/test"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/java"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/resources"/>
+ <classpathentry kind="src" path="modules/analysis/kuromoji/src/test"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/java"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/test"/>
<classpathentry kind="src" path="modules/analysis/smartcn/src/java"/>
Modified: lucene/dev/branches/lucene3305/modules/analysis/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/README.txt?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/README.txt (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/README.txt Tue Jan 3 04:22:59 2012
@@ -22,6 +22,12 @@ lucene-analyzers-icu-XX.jar
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version >= 4.6.0)
+lucene-analyzers-kuromoji-XX.jar
+ An analyzer with morphological analysis for Japanese.
+
+lucene-analyzers-morfologik-XX.jar
+ An analyzer using the Morfologik stemming library.
+
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@@ -35,21 +41,20 @@ lucene-analyzers-stempel-XX.jar
An add-on analysis library that contains a universal algorithmic stemmer,
including tables for the Polish language.
-lucene-analyzers-morfologik-XX.jar
- An analyzer using the Morfologik stemming library.
-
common/src/java
icu/src/java
+kuromoji/src/java
+morfologik/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
-morfologik/src/java
- The source code for the ffve libraries.
+ The source code for the libraries.
common/src/test
icu/src/test
+kuromoji/src/test
+morfologik/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
-morfologik/src/test
- Unit tests for the five libraries.
+ Unit tests for the libraries.
Modified: lucene/dev/branches/lucene3305/modules/analysis/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/build.xml?rev=1226632&r1=1226631&r2=1226632&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/build.xml (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/build.xml Tue Jan 3 04:22:59 2012
@@ -23,9 +23,10 @@
Additional Analyzers
- common: Additional Analyzers
- icu: Analyzers that use functionality from ICU
+ - kuromoji: Japanese Morphological Analyzer
+ - morfologik: Morfologik Stemmer
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
- - morfologik: Morfologik Stemmer
</description>
<target name="common">
@@ -36,6 +37,14 @@
<ant dir="icu" />
</target>
+ <target name="kuromoji">
+ <ant dir="kuromoji" />
+ </target>
+
+ <target name="morfologik">
+ <ant dir="morfologik" />
+ </target>
+
<target name="phonetic">
<ant dir="phonetic" />
</target>
@@ -48,52 +57,53 @@
<ant dir="stempel" />
</target>
- <target name="morfologik">
- <ant dir="morfologik" />
- </target>
-
<target name="default" depends="compile"/>
- <target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
+ <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
+ <ant dir="kuromoji" target="clean"/>
+ <ant dir="morfologik" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
- <ant dir="morfologik" target="clean" />
</target>
<target name="validate">
<ant dir="common" target="validate" />
<ant dir="icu" target="validate" />
+ <ant dir="kuromoji" target="validate" />
+ <ant dir="morfologik" target="validate" />
<ant dir="phonetic" target="validate" />
<ant dir="smartcn" target="validate" />
<ant dir="stempel" target="validate" />
- <ant dir="morfologik" target="validate" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
+ <ant dir="kuromoji" target="compile-core" />
+ <ant dir="morfologik" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
- <ant dir="morfologik" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
+ <ant dir="kuromoji" target="compile-test" />
+ <ant dir="morfologik" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
- <ant dir="morfologik" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
+ <ant dir="kuromoji" target="test" />
+ <ant dir="morfologik" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
- <ant dir="morfologik" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@@ -101,28 +111,31 @@
<target name="dist-maven" depends="default,javadocs">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
+ <ant dir="kuromoji" target="dist-maven" />
+ <ant dir="morfologik" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
- <ant dir="morfologik" target="dist-maven" />
</target>
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
+ <ant dir="kuromoji" target="javadocs" />
+ <ant dir="morfologik" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
- <ant dir="morfologik" target="javadocs" />
</target>
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
+ <ant dir="kuromoji" target="javadocs-index.html" />
+ <ant dir="morfologik" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
- <ant dir="morfologik" target="javadocs-index.html" />
</target>
</project>
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml Tue Jan 3 04:22:59 2012
@@ -0,0 +1,66 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="analyzers-kuromoji" default="default">
+
+  <description>
+    Kuromoji Japanese Morphological Analyzer
+  </description>
+
+  <property name="build.dir" location="../build/kuromoji" />
+  <property name="dist.dir" location="../dist/kuromoji" />
+
+  <!-- Source dictionary archive that download-dict fetches and build-dict reads. -->
+  <property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
+  <property name="dict.src.file" value="${ipadic.version}.tar.gz" />
+  <!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
+  <property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
+  <property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
+
+  <!-- Arguments handed to DictionaryBuilder by the build-dict target. -->
+  <property name="dict.encoding" value="euc-jp"/>
+  <property name="dict.format" value="ipadic"/>
+  <property name="dict.normalize" value="true"/>
+  <property name="dict.target.dir" location="./src/resources"/>
+
+  <!-- Ant properties are immutable once set, so the values above win over any
+       same-named defaults that the imported build file may declare. -->
+  <import file="../../../lucene/contrib/contrib-build.xml"/>
+
+  <!-- Set only when the unpacked dictionary directory already exists; lets download-dict be skipped. -->
+  <available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+
+  <!-- Downloads and unpacks the source dictionary unless it is already present. -->
+  <target name="download-dict" unless="dict.available">
+    <!-- The target can be invoked on its own, so make sure the destination directory exists. -->
+    <mkdir dir="${build.dir}"/>
+    <get src="${dict.url}" dest="${build.dir}/${dict.src.file}"/>
+    <gunzip src="${build.dir}/${dict.src.file}"/>
+    <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
+  </target>
+
+  <!-- Runs DictionaryBuilder with: format, source dir, target dir, encoding, normalize flag. -->
+  <target name="build-dict" depends="compile-core, download-dict">
+    <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
+      <classpath>
+        <!-- "classpath" is a path id, not a property: it must be referenced with refid.
+             ${classpath} would expand to an undefined property. -->
+        <path refid="classpath"/>
+        <pathelement path="${build.dir}/classes/java"/>
+      </classpath>
+      <arg value="${dict.format}"/>
+      <arg value="${dict.src.dir}"/>
+      <arg value="${dict.target.dir}"/>
+      <arg value="${dict.encoding}"/>
+      <arg value="${dict.normalize}"/>
+    </java>
+  </target>
+</project>
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,89 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionaries;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+
+/**
+ * Debugging aid: builds the Viterbi lattice for a text, searches the best path
+ * through it and returns the result rendered by {@link GraphvizFormatter}.
+ * Instances are created through {@link #builder()}.
+ */
+public class DebugTokenizer {
+
+  private final GraphvizFormatter formatter;
+
+  private final Viterbi viterbi;
+
+  /**
+   * Creates a debug tokenizer. The trie, dictionaries and connection costs are
+   * obtained from {@link Dictionaries}.
+   *
+   * @param userDictionary user dictionary; {@code null} when the builder was given none
+   * @param mode tokenization mode handed to {@link Viterbi}
+   */
+  protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
+
+    this.viterbi = new Viterbi(Dictionaries.getTrie(),
+        Dictionaries.getDictionary(),
+        Dictionaries.getUnknownDictionary(),
+        Dictionaries.getCosts(),
+        userDictionary,
+        mode);
+
+    this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
+  }
+
+  /**
+   * Tokenizes the text and formats the lattice together with the best path.
+   *
+   * @param text text to analyze
+   * @return output of {@link GraphvizFormatter#format}
+   */
+  public String debugTokenize(String text) {
+    ViterbiNode[][][] lattice = viterbi.build(text);
+    List<ViterbiNode> bestPath = viterbi.search(lattice);
+    // NOTE(review): lattice[0] / lattice[1] are presumably the start- and end-indexed
+    // node arrays produced by Viterbi.build; confirm against Viterbi.
+    return formatter.format(lattice[0], lattice[1], bestPath);
+  }
+
+  /**
+   * Get Builder to create DebugTokenizer instance.
+   * @return Builder
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Builder class used to create DebugTokenizer instance.
+   */
+  public static class Builder {
+
+    private Mode mode = Mode.NORMAL;
+
+    private UserDictionary userDictionary = null;
+
+    /**
+     * Set tokenization mode.
+     * Default: NORMAL
+     * @param mode tokenization mode
+     * @return Builder
+     */
+    public synchronized Builder mode(Mode mode) {
+      this.mode = mode;
+      return this;
+    }
+
+    /**
+     * Set user dictionary input stream. The stream is not closed by this method;
+     * it remains the caller's responsibility.
+     * @param userDictionaryInputStream dictionary file as input stream
+     * @return Builder
+     * @throws IOException if reading the dictionary fails
+     */
+    public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
+        throws IOException {
+      this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+      return this;
+    }
+
+    /**
+     * Set user dictionary path. The file is opened, read and closed by this method.
+     * @param userDictionaryPath path to dictionary file
+     * @return Builder
+     * @throws FileNotFoundException if the file cannot be opened
+     * @throws IOException if reading the dictionary fails
+     */
+    public synchronized Builder userDictionary(String userDictionaryPath)
+        throws FileNotFoundException, IOException {
+      InputStream in = new BufferedInputStream(new FileInputStream(userDictionaryPath));
+      try {
+        // NOTE(review): assumes UserDictionary.read consumes the stream before returning; confirm.
+        this.userDictionary(in);
+      } finally {
+        // This method opened the stream, so it must release the file handle.
+        in.close();
+      }
+      return this;
+    }
+
+    /**
+     * Create DebugTokenizer instance.
+     * @return DebugTokenizer
+     */
+    public synchronized DebugTokenizer build() {
+      return new DebugTokenizer(userDictionary, mode);
+    }
+  }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * A single token of the analyzed text. Holds the surface form, its position in
+ * the input and the node type; feature lookups are delegated to the dictionary
+ * the token was created with, keyed by the word id.
+ */
+public class Token {
+  private final int wordId;
+
+  private final String surfaceForm;
+
+  private final Type type;
+
+  private final int position;
+
+  private final Dictionary dictionary;
+
+  /**
+   * @param wordId id used for all dictionary lookups
+   * @param surfaceForm text of the token
+   * @param type node type the token was created from
+   * @param position index of the token in the input text
+   * @param dictionary dictionary queried for features, reading and part of speech
+   */
+  public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+    this.dictionary = dictionary;
+    this.position = position;
+    this.type = type;
+    this.surfaceForm = surfaceForm;
+    this.wordId = wordId;
+  }
+
+  /**
+   * @return the surface form given at construction
+   */
+  public String getSurfaceForm() {
+    return this.surfaceForm;
+  }
+
+  /**
+   * @return all features, as returned by the dictionary for this word id
+   */
+  public String getAllFeatures() {
+    return this.dictionary.getAllFeatures(this.wordId);
+  }
+
+  /**
+   * @return all features as an array, as returned by the dictionary for this word id
+   */
+  public String[] getAllFeaturesArray() {
+    return this.dictionary.getAllFeaturesArray(this.wordId);
+  }
+
+  /**
+   * @return reading looked up in the dictionary. null if token doesn't have reading.
+   */
+  public String getReading() {
+    return this.dictionary.getReading(this.wordId);
+  }
+
+  /**
+   * @return part of speech looked up in the dictionary
+   */
+  public String getPartOfSpeech() {
+    return this.dictionary.getPartOfSpeech(this.wordId);
+  }
+
+  /**
+   * Tells whether the token is a known word.
+   * @return true if the token type is KNOWN (standard dictionary). false otherwise.
+   */
+  public boolean isKnown() {
+    return Type.KNOWN == this.type;
+  }
+
+  /**
+   * Tells whether the token is an unknown word.
+   * @return true if the token type is UNKNOWN. false otherwise.
+   */
+  public boolean isUnknown() {
+    return Type.UNKNOWN == this.type;
+  }
+
+  /**
+   * Tells whether the token comes from the user dictionary.
+   * @return true if the token type is USER. false otherwise.
+   */
+  public boolean isUser() {
+    return Type.USER == this.type;
+  }
+
+  /**
+   * Index of this token in the input text.
+   * @return the position given at construction
+   */
+  public int getPosition() {
+    return this.position;
+  }
+
+}
\ No newline at end of file
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,238 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionaries;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Tokenizer main class.
+ * <p>
+ * Optionally cuts the input into segments at Japanese punctuation, runs the
+ * Viterbi search on each segment and converts the best path into {@link Token}s.
+ * <p>
+ * Described as thread safe by the original author. NOTE(review): that relies on
+ * {@link Viterbi} being safe for concurrent use, which cannot be confirmed from this file.
+ */
+public class Tokenizer {
+  /** Tokenization mode, handed unchanged to {@link Viterbi}. */
+  public enum Mode {
+    NORMAL, SEARCH, EXTENDED
+  }
+
+  // The punctuation is written as Unicode escapes so the values do not depend on
+  // the encoding used to read or compile this source file.
+
+  /** Ideographic full stop "。" (U+3002), "maru". */
+  private static final char MARU = '\u3002';
+
+  /** Ideographic comma "、" (U+3001), "ten". */
+  private static final char TEN = '\u3001';
+
+  private final Viterbi viterbi;
+
+  /** Dictionary to attach to a token, selected by the type of its Viterbi node. */
+  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+  private final boolean split;
+
+  /**
+   * Constructor. The trie, dictionaries and connection costs are obtained from
+   * {@link Dictionaries}.
+   * @param userDictionary user dictionary; {@code null} when the builder was given none
+   * @param mode tokenization mode
+   * @param split whether input is split at "。" and "、" before tokenizing
+   */
+  protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
+
+    this.viterbi = new Viterbi(Dictionaries.getTrie(),
+        Dictionaries.getDictionary(),
+        Dictionaries.getUnknownDictionary(),
+        Dictionaries.getCosts(),
+        userDictionary,
+        mode);
+
+    this.split = split;
+
+    dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
+    dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
+    dictionaryMap.put(Type.USER, userDictionary);
+  }
+
+  /**
+   * Tokenize input text
+   * @param text text to tokenize
+   * @return list of Token; token positions are relative to the start of {@code text}
+   */
+  public List<Token> tokenize(String text) {
+
+    if (!split) {
+      return doTokenize(0, text);
+    }
+
+    List<Integer> splitPositions = getSplitPositions(text);
+
+    if (splitPositions.isEmpty()) {
+      return doTokenize(0, text);
+    }
+
+    List<Token> result = new ArrayList<Token>();
+    int offset = 0;
+    for (int position : splitPositions) {
+      // Each segment ends with, and includes, the punctuation character it was split at.
+      result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+      offset = position + 1;
+    }
+
+    // Remaining text after the last punctuation character, if any.
+    if (offset < text.length()) {
+      result.addAll(doTokenize(offset, text.substring(offset)));
+    }
+
+    return result;
+  }
+
+  /**
+   * Split input text at 句読点 (Japanese punctuation), which is 。 (U+3002) and 、 (U+3001)
+   * @param text text to scan
+   * @return list of split positions, i.e. the indexes of every "。" and "、" in ascending order
+   */
+  private List<Integer> getSplitPositions(String text) {
+    List<Integer> splitPositions = new ArrayList<Integer>();
+
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      if (c == MARU || c == TEN) {
+        splitPositions.add(i);
+      }
+    }
+
+    return splitPositions;
+  }
+
+  /**
+   * Tokenize input sentence.
+   * @param offset offset of sentence in original input text
+   * @param sentence sentence to tokenize
+   * @return list of Token
+   */
+  private List<Token> doTokenize(int offset, String sentence) {
+    List<Token> result = new ArrayList<Token>();
+
+    ViterbiNode[][][] lattice = viterbi.build(sentence);
+    List<ViterbiNode> bestPath = viterbi.search(lattice);
+    for (ViterbiNode node : bestPath) {
+      int wordId = node.getWordId();
+      if (node.getType() == Type.KNOWN && wordId == 0) { // Do not include BOS/EOS
+        continue;
+      }
+      // Pass different dictionary based on the type of node
+      Dictionary dictionary = dictionaryMap.get(node.getType());
+      result.add(new Token(wordId, node.getSurfaceForm(), node.getType(),
+          offset + node.getStartIndex(), dictionary));
+    }
+
+    return result;
+  }
+
+  /**
+   * Get Builder to create Tokenizer instance.
+   * @return Builder
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Builder class used to create Tokenizer instance.
+   */
+  public static class Builder {
+
+    private Mode mode = Mode.NORMAL;
+
+    private boolean split = true;
+
+    private UserDictionary userDictionary = null;
+
+    /**
+     * Set tokenization mode
+     * Default: NORMAL
+     * @param mode tokenization mode
+     * @return Builder
+     */
+    public synchronized Builder mode(Mode mode) {
+      this.mode = mode;
+      return this;
+    }
+
+    /**
+     * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
+     * Splitting shouldn't change the result of tokenization most of the cases.
+     * Default: true
+     *
+     * @param split whether tokenizer should split input string
+     * @return Builder
+     */
+    public synchronized Builder split(boolean split) {
+      this.split = split;
+      return this;
+    }
+
+    /**
+     * Set user dictionary input stream. The stream is not closed by this method;
+     * it remains the caller's responsibility.
+     * @param userDictionaryInputStream dictionary file as input stream
+     * @return Builder
+     * @throws IOException if reading the dictionary fails
+     */
+    public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
+      this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+      return this;
+    }
+
+    /**
+     * Set user dictionary path. A null or empty path is ignored. Otherwise the
+     * file is opened, read and closed by this method.
+     * @param userDictionaryPath path to dictionary file
+     * @return Builder
+     * @throws IOException if reading the dictionary fails
+     * @throws FileNotFoundException if the file cannot be opened
+     */
+    public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
+      if (userDictionaryPath != null && !userDictionaryPath.isEmpty()) {
+        InputStream in = new BufferedInputStream(new FileInputStream(userDictionaryPath));
+        try {
+          // NOTE(review): assumes UserDictionary.read consumes the stream before returning; confirm.
+          this.userDictionary(in);
+        } finally {
+          // This method opened the stream, so it must release the file handle.
+          in.close();
+        }
+      }
+      return this;
+    }
+
+    /**
+     * Create Tokenizer instance
+     * @return Tokenizer
+     */
+    public synchronized Tokenizer build() {
+      return new Tokenizer(userDictionary, mode, split);
+    }
+  }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Serializable;
+import java.util.EnumMap;
+
/**
 * Maps every {@code char} value to a {@link CharacterClass} and stores, per character class,
 * an "invoke / group / length" triple.
 */
public final class CharacterDefinition implements Serializable {
  private static final long serialVersionUID = -1436753619176638532L;

  /** Position of the invoke flag inside an invoke-definition triple. */
  private static final int INVOKE_INDEX = 0;

  /** Position of the group flag inside an invoke-definition triple. */
  private static final int GROUP_INDEX = 1;

  // One slot per char value (0..65535); every slot is DEFAULT until overridden.
  private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];

  // Per character class: { invoke, group, length }.
  private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
      new EnumMap<CharacterClass, int[]>(CharacterClass.class);

  public enum CharacterClass {
    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;

    /**
     * @return the numeric id of this class, which is its declaration position
     *         (reordering the constants therefore changes the ids)
     */
    public int getId() {
      return ordinal();
    }
  }

  /**
   * Creates a definition in which every character belongs to {@link CharacterClass#DEFAULT}
   * and no invoke definitions are registered.
   */
  public CharacterDefinition() {
    for (int i = 0; i < characterCategoryMap.length; i++) {
      characterCategoryMap[i] = CharacterClass.DEFAULT;
    }
  }

  /**
   * @param c character to classify
   * @return id of the character class assigned to {@code c}
   */
  public int lookup(char c) {
    return characterCategoryMap[c].getId();
  }

  /**
   * @param c character to classify
   * @return character class assigned to {@code c}
   */
  public CharacterClass getCharacterClass(char c) {
    return characterCategoryMap[c];
  }

  /**
   * @param c character to check
   * @return true if the class of {@code c} has an invoke definition whose invoke value is 1;
   *         false otherwise, including when no definition was registered for that class
   */
  public boolean isInvoke(char c) {
    return hasFlag(c, INVOKE_INDEX);
  }

  /**
   * @param c character to check
   * @return true if the class of {@code c} has an invoke definition whose group value is 1;
   *         false otherwise, including when no definition was registered for that class
   */
  public boolean isGroup(char c) {
    return hasFlag(c, GROUP_INDEX);
  }

  /**
   * @param c character to check
   * @return true if {@code c} is classified as KANJI or KANJINUMERIC
   */
  public boolean isKanji(char c) {
    CharacterClass characterClass = characterCategoryMap[c];
    return characterClass == CharacterClass.KANJI || characterClass == CharacterClass.KANJINUMERIC;
  }

  /**
   * Put mapping from unicode code point to character class.
   *
   * @param codePoint code point; used directly as an index, so it must be in the range 0..65535
   * @param characterClassName character class name; if it contains several names separated
   *        by a space, only the first one is used
   * @throws IllegalArgumentException if the name is not a {@link CharacterClass} constant
   * @throws ArrayIndexOutOfBoundsException if {@code codePoint} is outside 0..65535
   */
  public void putCharacterCategory(int codePoint, String characterClassName) {
    String primaryName = characterClassName.split(" ")[0]; // use first category class

    // Override Nakaguro
    if (codePoint == 0x30FB) {
      primaryName = "SYMBOL";
    }
    characterCategoryMap[codePoint] = CharacterClass.valueOf(primaryName);
  }

  /**
   * Registers the invoke definition of a character class, replacing any previous one.
   *
   * @param characterClassName name of a {@link CharacterClass} constant
   * @param invoke invoke value; {@link #isInvoke(char)} reports true when it is 1
   * @param group group value; {@link #isGroup(char)} reports true when it is 1
   * @param length length value, stored as given
   * @throws IllegalArgumentException if the name is not a {@link CharacterClass} constant
   */
  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
    CharacterClass characterClass = CharacterClass.valueOf(characterClassName);
    int[] values = { invoke, group, length };
    invokeDefinitionMap.put(characterClass, values);
  }

  /** Reads one flag of the invoke definition for the class of {@code c}; a missing definition counts as unset. */
  private boolean hasFlag(char c, int index) {
    int[] invokeDefinition = invokeDefinitionMap.get(characterCategoryMap[c]);
    return invokeDefinition != null && invokeDefinition[index] == 1;
  }
}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,80 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+
/**
 * Matrix of connection costs indexed by backward id and forward id, with helpers to
 * serialize it to a file and read it back.
 */
public class ConnectionCosts implements Serializable{

  private static final long serialVersionUID = -7704592689635266457L;

  public static final String FILENAME = "cc.dat";

  /** Cost returned by {@link #get(int, int)} when an id is outside the matrix. */
  private static final int OUT_OF_RANGE_COST = 50000;

  private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.

  /** Creates an instance without a cost matrix. */
  public ConnectionCosts() {

  }

  /**
   * Creates a zero-filled cost matrix.
   *
   * @param forwardSize number of forward ids
   * @param backwardSize number of backward ids
   */
  public ConnectionCosts(int forwardSize, int backwardSize) {
    this.costs = new short[backwardSize][forwardSize];
  }

  /**
   * Stores a cost.
   *
   * @param forwardId forward id (second index of the matrix)
   * @param backwardId backward id (first index of the matrix)
   * @param cost cost value; it is narrowed to {@code short}
   */
  public void add(int forwardId, int backwardId, int cost) {
    this.costs[backwardId][forwardId] = (short)cost;
  }

  /**
   * Looks up a cost.
   *
   * @param forwardId forward id
   * @param backwardId backward id
   * @return the stored cost, or 50000 when either id lies outside the matrix
   */
  public int get(int forwardId, int backwardId) {
    // FIXME: There seems to be something wrong with the double array trie in some rare
    // cases causing an IndexOutOfBoundsException. Use a guard as a temporary work-around
    // and return a high cost to advise Mr. Viterbi strongly to not use this transition.
    // Negative ids are rejected too; otherwise the guard would still let the exception through.
    if (backwardId < 0 || backwardId >= costs.length) {
      return OUT_OF_RANGE_COST;
    }
    short[] row = costs[backwardId];
    if (forwardId < 0 || forwardId >= row.length) {
      return OUT_OF_RANGE_COST;
    }
    return row[forwardId];
  }

  /**
   * Serializes this object to the file {@link #FILENAME} inside the given directory.
   *
   * @param directoryname directory to write into
   * @throws IOException if the file cannot be created or written
   */
  public void write(String directoryname) throws IOException {
    String filename = directoryname + File.separator + FILENAME;
    FileOutputStream fileStream = new FileOutputStream(filename);
    try {
      ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(fileStream));
      outputStream.writeObject(this);
      outputStream.close();
    } finally {
      // Releases the file even when serialization fails; closing twice is harmless.
      fileStream.close();
    }
  }

  /**
   * Reads the instance stored as the class path resource {@link #FILENAME}.
   *
   * @return the deserialized instance
   * @throws IOException if the resource is missing or cannot be read
   * @throws ClassNotFoundException if deserialization cannot resolve a class
   */
  public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
    InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
    if (is == null) {
      throw new IOException("Connection costs resource not found on class path: " + FILENAME);
    }
    return read(is);
  }

  /**
   * Deserializes an instance from the stream and closes the stream.
   *
   * @param is stream positioned at a serialized ConnectionCosts
   * @return the deserialized instance
   * @throws IOException if the stream is null or cannot be read
   * @throws ClassNotFoundException if deserialization cannot resolve a class
   */
  public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
    if (is == null) {
      throw new IOException("No input stream to read connection costs from");
    }
    try {
      ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
      return (ConnectionCosts) ois.readObject();
    } finally {
      is.close();
    }
  }

}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
+
+public final class Dictionaries {
+
+ private static TokenInfoDictionary dictionary;
+
+ private static UnknownDictionary unknownDictionary;
+
+ private static ConnectionCosts costs;
+
+ private static DoubleArrayTrie trie;
+
+ private static boolean initialized = false;
+
+ static {
+ load();
+ }
+
+ private static synchronized void load() {
+
+ if (Dictionaries.initialized) {
+ return;
+ }
+
+ try {
+ Dictionaries.dictionary = TokenInfoDictionary.getInstance();
+ Dictionaries.unknownDictionary = UnknownDictionary.getInstance();
+ Dictionaries.costs = ConnectionCosts.getInstance();
+ Dictionaries.trie = DoubleArrayTrie.getInstance();
+ Dictionaries.initialized = true;
+ } catch (Exception ex) {
+ throw new RuntimeException("Could not load dictionaries! Ouch, ouch, ouch...", ex);
+ }
+ }
+
+ /**
+ * @return the dictionary
+ */
+ public static TokenInfoDictionary getDictionary() {
+ return dictionary;
+ }
+
+ /**
+ * @param dictionary the dictionary to set
+ */
+ public static void setDictionary(TokenInfoDictionary dictionary) {
+ Dictionaries.dictionary = dictionary;
+ }
+
+ /**
+ * @return the unknownDictionary
+ */
+ public static UnknownDictionary getUnknownDictionary() {
+ return unknownDictionary;
+ }
+
+ /**
+ * @param unknownDictionary the unknownDictionary to set
+ */
+ public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
+ Dictionaries.unknownDictionary = unknownDictionary;
+ }
+
+ /**
+ * @return the costs
+ */
+ public static ConnectionCosts getCosts() {
+ return costs;
+ }
+
+ /**
+ * @param costs the costs to set
+ */
+ public static void setCosts(ConnectionCosts costs) {
+ Dictionaries.costs = costs;
+ }
+
+ /**
+ * @return the trie
+ */
+ public static DoubleArrayTrie getTrie() {
+ return trie;
+ }
+
+ /**
+ * @param trie the trie to set
+ */
+ public static void setTrie(DoubleArrayTrie trie) {
+ Dictionaries.trie = trie;
+ }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,80 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/**
 * Read access to the per-word data of a dictionary: connection ids, word cost and features.
 */
public interface Dictionary {

  /** Separator placed between the features of one entry: the NUL character (U+0000). */
  String INTERNAL_SEPARATOR = "\u0000";

  /**
   * Get left id of specified word
   * @param wordId word ID of token
   * @return left id
   */
  int getLeftId(int wordId);

  /**
   * Get right id of specified word
   * @param wordId word ID of token
   * @return right id
   */
  int getRightId(int wordId);

  /**
   * Get word cost of specified word
   * @param wordId word ID of token
   * @return word cost
   */
  int getWordCost(int wordId);

  /**
   * Get all features of tokens
   * @param wordId word ID of token
   * @return All features of the token
   */
  String getAllFeatures(int wordId);

  /**
   * Get all features as array
   * @param wordId word ID of token
   * @return Array containing all features of the token
   */
  String[] getAllFeaturesArray(int wordId);

  /**
   * Get Part-Of-Speech of tokens
   * @param wordId word ID of token
   * @return Part-Of-Speech of the token
   */
  String getPartOfSpeech(int wordId);

  /**
   * Get reading of tokens
   * @param wordId word ID of token
   * @return Reading of the token
   */
  String getReading(int wordId);

  /**
   * Get feature(s) of tokens
   * @param wordId word ID of token
   * @param fields array of index. If this is empty, return all features.
   * @return Features of the token
   */
  String getFeature(int wordId, int... fields);
}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,244 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
+
+import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+
+public class TokenInfoDictionary implements Dictionary{
+
+ public static final String FILENAME = "tid.dat";
+
+ public static final String TARGETMAP_FILENAME = "tid_map.dat";
+
+ protected ByteBuffer buffer;
+
+ protected int[][] targetMap;
+
+ public TokenInfoDictionary() {
+ }
+
+ public TokenInfoDictionary(int size) {
+ targetMap = new int[1][];
+ buffer = ByteBuffer.allocate(size);
+ }
+
+ /**
+ * put the entry in map
+ * @param wordId
+ * @param entry
+ * @return current position of buffer, which will be wordId of next entry
+ */
+ public int put(String[] entry) {
+ short leftId = Short.parseShort(entry[1]);
+ short rightId = Short.parseShort(entry[2]);
+ short wordCost = Short.parseShort(entry[3]);
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 4; i < entry.length; i++){
+ sb.append(entry[i]).append(INTERNAL_SEPARATOR);
+ }
+ String features = sb.deleteCharAt(sb.length() - 1).toString();
+ int featuresSize = features.length()* 2;
+
+ // extend buffer if necessary
+ int left = buffer.limit() - buffer.position();
+ if (8 + featuresSize > left) { // four short and features
+ ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
+ buffer.flip();
+ newBuffer.put(buffer);
+ buffer = newBuffer;
+ }
+
+ buffer.putShort(leftId);
+ buffer.putShort(rightId);
+ buffer.putShort(wordCost);
+ buffer.putShort((short)featuresSize);
+ for (char c : features.toCharArray()){
+ buffer.putChar(c);
+ }
+
+ return buffer.position();
+ }
+
+ public void addMapping(int sourceId, int wordId) {
+ if(targetMap.length <= sourceId) {
+ int[][] newArray = new int[sourceId + 1][];
+ System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+ targetMap = newArray;
+ }
+
+ // Prepare array -- extend the length of array by one
+ int[] current = targetMap[sourceId];
+ if (current == null) {
+ current = new int[1];
+ } else {
+ int[] newArray = new int[current.length + 1];
+ System.arraycopy(current, 0, newArray, 0, current.length);
+ current = newArray;
+ }
+ targetMap[sourceId] = current;
+
+ int[] targets = targetMap[sourceId];
+ targets[targets.length - 1] = wordId;
+ }
+
+ public int[] lookupWordIds(int sourceId) {
+ return targetMap[sourceId];
+ }
+
+ @Override
+ public int getLeftId(int wordId) {
+ return buffer.getShort(wordId);
+ }
+
+ @Override
+ public int getRightId(int wordId) {
+ return buffer.getShort(wordId + 2); // Skip left id
+ }
+
+ @Override
+ public int getWordCost(int wordId) {
+ return buffer.getShort(wordId + 4); // Skip left id and right id
+ }
+
+ @Override
+ public String[] getAllFeaturesArray(int wordId) {
+ int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
+ char[] targetArr = new char[size];
+ int offset = wordId + 6 + 2; // offset is position where features string starts
+ for(int i = 0; i < size; i++){
+ targetArr[i] = buffer.getChar(offset + i * 2);
+ }
+ String allFeatures = new String(targetArr);
+ return allFeatures.split(INTERNAL_SEPARATOR);
+ }
+
+ @Override
+ public String getFeature(int wordId, int... fields) {
+ String[] allFeatures = getAllFeaturesArray(wordId);
+ StringBuilder sb = new StringBuilder();
+
+ if(fields.length == 0){ // All features
+ for(String feature : allFeatures) {
+ sb.append(CSVUtil.quoteEscape(feature)).append(",");
+ }
+ } else if(fields.length == 1) { // One feature doesn't need to escape value
+ sb.append(allFeatures[fields[0]]).append(",");
+ } else {
+ for(int field : fields){
+ sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+ }
+ }
+
+ return sb.deleteCharAt(sb.length() - 1).toString();
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ return getFeature(wordId, 7);
+ }
+
+ @Override
+ public String getAllFeatures(int wordId) {
+ return getFeature(wordId);
+ }
+
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ return getFeature(wordId, 0, 1, 2, 3);
+ }
+
+
+ /**
+ * Write dictionary in file
+ * Dictionary format is:
+ * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+ * @param filename
+ * @throws IOException
+ */
+ public void write(String directoryname) throws IOException {
+ writeDictionary(directoryname + File.separator + FILENAME);
+ writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+ }
+
+ protected void writeTargetMap(String filename) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+ oos.writeObject(targetMap);
+ oos.close();
+ }
+
+ protected void writeDictionary(String filename) throws IOException {
+ FileOutputStream fos = new FileOutputStream(filename);
+ DataOutputStream dos = new DataOutputStream(fos);
+ dos.writeInt(buffer.position());
+ WritableByteChannel channel = Channels.newChannel(fos);
+ // Write Buffer
+ buffer.flip(); // set position to 0, set limit to current position
+ channel.write(buffer);
+
+ fos.close();
+ }
+
+ /**
+ * Read dictionary into directly allocated buffer.
+ * @return TokenInfoDictionary instance
+ * @throws IOException
+ * @throws ClassNotFoundException
+ */
+ public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
+ TokenInfoDictionary dictionary = new TokenInfoDictionary();
+ ClassLoader loader = dictionary.getClass().getClassLoader();
+ dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+ dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+ return dictionary;
+ }
+
+ protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
+ ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+ targetMap = (int[][]) ois.readObject();
+ is.close();
+ }
+
+ protected void loadDictionary(InputStream is) throws IOException {
+ DataInputStream dis = new DataInputStream(is);
+ int size = dis.readInt();
+
+ ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+
+ ReadableByteChannel channel = Channels.newChannel(is);
+ channel.read(tmpBuffer);
+ is.close();
+ buffer = tmpBuffer.asReadOnlyBuffer();
+ }
+
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,142 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
+
+public class UnknownDictionary extends TokenInfoDictionary {
+
+ public static final String FILENAME = "unk.dat";
+
+ public static final String TARGETMAP_FILENAME = "unk_map.dat";
+
+ public static final String CHARDEF_FILENAME = "cd.dat";
+
+ private CharacterDefinition characterDefinition;
+
+ /**
+ * Constructor
+ */
+ public UnknownDictionary() {
+ }
+
+ public UnknownDictionary(int size) {
+ super(size);
+ characterDefinition = new CharacterDefinition();
+ }
+
+ @Override
+ public int put(String[] entry) {
+ // Get wordId of current entry
+ int wordId = buffer.position();
+
+ // Put entry
+ int result = super.put(entry);
+
+ // Put entry in targetMap
+ int characterId = CharacterClass.valueOf(entry[0]).getId();
+ addMapping(characterId, wordId);
+ return result;
+ }
+
+ public int lookup(String text) {
+ if(!characterDefinition.isGroup(text.charAt(0))) {
+ return 1;
+ }
+
+ // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+ int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
+ int length = 1;
+ for (int i = 1; i < text.length(); i++) {
+ if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
+ length++;
+ } else {
+ break;
+ }
+ }
+
+ return length;
+ }
+
+ /**
+ * Put mapping from unicode code point to character class.
+ *
+ * @param codePoint code point
+ * @param class character class name
+ */
+ public void putCharacterCategory(int codePoint, String characterClassName) {
+ characterDefinition.putCharacterCategory(codePoint, characterClassName);
+ }
+
+ public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+ characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
+ }
+
+
+ public CharacterDefinition getCharacterDefinition() {
+ return characterDefinition;
+ }
+
+ /**
+ * Write dictionary in file
+ * Dictionary format is:
+ * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+ * @param filename
+ * @throws IOException
+ */
+ public void write(String directoryname) throws IOException {
+ writeDictionary(directoryname + File.separator + FILENAME);
+ writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+ writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
+ }
+
+ protected void writeCharDef(String filename) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+ oos.writeObject(characterDefinition);
+ oos.close();
+ }
+
+ public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
+ UnknownDictionary dictionary = new UnknownDictionary();
+ ClassLoader loader = dictionary.getClass().getClassLoader();
+ dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+ dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+ dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
+ return dictionary;
+ }
+
+ protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
+ ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+ characterDefinition = (CharacterDefinition) ois.readObject();
+ ois.close();
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ return null;
+ }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,196 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+
+public class UserDictionary implements Dictionary {
+
+ private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
+
+ private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
+
+ private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
+
+ public static final int WORD_COST = -100000;
+
+ public static final int LEFT_ID = 5;
+
+ public static final int RIGHT_ID = 5;
+
+ public UserDictionary() {
+
+ }
+
+ /**
+ * Lookup words in text
+ * @param text
+ * @return array of {wordId, position, length}
+ */
+ public int[][] lookup(String text) {
+ TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
+
+ for (String keyword : entries.descendingKeySet()) {
+ int offset = 0;
+ int position = text.indexOf(keyword, offset);
+ while (offset < text.length() && position >= 0) {
+ if(!result.containsKey(position)){
+ result.put(position, entries.get(keyword));
+ }
+ offset += position + keyword.length();
+ position = text.indexOf(keyword, offset);
+ }
+ }
+
+ return toIndexArray(result);
+ }
+
+ /**
+ * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
+ * @param input
+ * @return array of {wordId, index, length}
+ */
+ private int[][] toIndexArray(Map<Integer, int[]> input) {
+ ArrayList<int[]> result = new ArrayList<int[]>();
+ for (int i : input.keySet()) {
+ int[] wordIdAndLength = input.get(i);
+ int wordId = wordIdAndLength[0];
+ // convert length to index
+ int current = i;
+ for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
+ int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
+ result.add(token);
+ current += wordIdAndLength[j];
+ }
+ }
+ return result.toArray(new int[result.size()][]);
+ }
+
+ @Override
+ public int getLeftId(int wordId) {
+ return LEFT_ID;
+ }
+
+ @Override
+ public int getRightId(int wordId) {
+ return RIGHT_ID;
+ }
+
+ @Override
+ public int getWordCost(int wordId) {
+ return WORD_COST;
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ return getFeature(wordId, 0);
+ }
+
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ return getFeature(wordId, 1);
+ }
+
+ @Override
+ public String getAllFeatures(int wordId) {
+ return getFeature(wordId);
+ }
+
+ @Override
+ public String[] getAllFeaturesArray(int wordId) {
+ String allFeatures = featureEntries.get(wordId);
+ if(allFeatures == null) {
+ return null;
+ }
+
+ return allFeatures.split(INTERNAL_SEPARATOR);
+ }
+
+
+ @Override
+ public String getFeature(int wordId, int... fields) {
+ String[] allFeatures = getAllFeaturesArray(wordId);
+ if (allFeatures == null) {
+ return null;
+ }
+ StringBuilder sb = new StringBuilder();
+ if (fields.length == 0) { // All features
+ for (String feature : allFeatures) {
+ sb.append(CSVUtil.quoteEscape(feature)).append(",");
+ }
+ } else if (fields.length == 1) { // One feature doesn't need to escape value
+ sb.append(allFeatures[fields[0]]).append(",");
+ } else {
+ for (int field : fields){
+ sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+ }
+ }
+ return sb.deleteCharAt(sb.length() - 1).toString();
+ }
+
+ public static UserDictionary read(String filename) throws IOException {
+ return read(new FileInputStream(filename));
+ }
+
+ public static UserDictionary read(InputStream is) throws IOException {
+ UserDictionary dictionary = new UserDictionary();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+ String line = null;
+ int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
+ while ((line = reader.readLine()) != null) {
+ // Remove comments
+ line = line.replaceAll("#.*$", "");
+
+ // Skip empty lines or comment lines
+ if (line.trim().length() == 0) {
+ continue;
+ }
+ String[] values = CSVUtil.parse(line);
+ String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
+ String[] readings = values[2].replaceAll(" *", " ").split(" ");
+ String pos = values[3];
+
+ if (segmentation.length != readings.length) {
+ // FIXME: Should probably deal with this differently. Exception?
+ System.out.println("This entry is not properly formatted : " + line);
+ }
+
+ int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
+ wordIdAndLength[0] = wordId;
+ for (int i = 0; i < segmentation.length; i++) {
+ wordIdAndLength[i + 1] = segmentation[i].length();
+ dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
+ wordId++;
+ }
+ dictionary.entries.put(values[0], wordIdAndLength);
+ }
+ reader.close();
+ return dictionary;
+ }
+
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,317 @@
+package org.apache.lucene.analysis.kuromoji.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.IntBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.FileChannel;
+import java.nio.channels.ReadableByteChannel;
+
+import org.apache.lucene.analysis.kuromoji.trie.Trie.Node;
+
+public class DoubleArrayTrie {
+
+ // Name of the file produced by write(String) and loaded as a classpath resource by getInstance()
+ public static final String FILENAME = "dat.dat";
+
+ // Marker appended to every word by Trie.add(String); a transition on this character means "end of word"
+ public static final char TERMINATING_CHARACTER = '\u0001';
+
+ // Number of int slots initially allocated by build(Trie) for each of the base and check arrays
+ private static final int BASE_CHECK_INITILAL_SIZE = 1000000;
+
+ // Number of char slots initially allocated by build(Trie) for the tail array;
+ // addToTail grows the tail by TAIL_INITIAL_SIZE / 100 chars at a time
+ private static final int TAIL_INITIAL_SIZE = 10000;
+
+ // A base value >= TAIL_OFFSET is not a base but a tail reference: position in tail = base - TAIL_OFFSET
+ private static final int TAIL_OFFSET = 10000000;
+
+ // base[index]: offset used to compute the next index as index + base + character
+ private IntBuffer baseBuffer;
+
+ // check[index]: index of the node this slot was reached from; lookup compares it against the previous index
+ private IntBuffer checkBuffer;
+
+ // Remaining characters of words, stored once the rest of a word is a single path in the trie
+ private CharBuffer tailBuffer;
+
+ // Next free position in the tail array, already shifted by TAIL_OFFSET so it can be stored as a base value
+ private int tailIndex = TAIL_OFFSET;
+
+
+ // Creates an instance without buffers; they are set by build(Trie) or by read(InputStream)
+ public DoubleArrayTrie(){
+ }
+
+ /**
+  * Writes this trie to the file {@link #FILENAME} inside the given directory,
+  * replacing any previous content of that file.
+  * <p>
+  * File layout: capacity of the base buffer (int), capacity of the tail buffer (int),
+  * all base values, all check values, then all tail characters.
+  * @param directoryname directory in which the file is created
+  * @throws IOException if the file cannot be opened or written
+  */
+ public void write(String directoryname) throws IOException {
+   String filename = directoryname + File.separator + FILENAME;
+
+   baseBuffer.rewind();
+   checkBuffer.rewind();
+   tailBuffer.rewind();
+
+   File file = new File(filename);
+   if (file.exists()) {
+     file.delete(); // best effort; the truncation below covers a failed delete
+   }
+
+   RandomAccessFile raf = new RandomAccessFile(filename, "rw");
+   try {
+     // Make sure no bytes of an older, longer file survive behind the new content
+     raf.setLength(0);
+     FileChannel channel = raf.getChannel();
+     raf.writeInt(baseBuffer.capacity());
+     raf.writeInt(tailBuffer.capacity());
+
+     ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
+     tmpBuffer.asIntBuffer().put(baseBuffer);
+     writeFully(channel, tmpBuffer);
+
+     tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
+     tmpBuffer.asIntBuffer().put(checkBuffer);
+     writeFully(channel, tmpBuffer);
+
+     tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
+     tmpBuffer.asCharBuffer().put(tailBuffer);
+     writeFully(channel, tmpBuffer);
+   } finally {
+     // Close the file even if one of the writes fails
+     raf.close();
+   }
+ }
+
+ /**
+  * Writes the whole buffer, from its first byte, to the channel.
+  * A single channel write is allowed to write fewer bytes than requested, hence the loop.
+  */
+ private static void writeFully(FileChannel channel, ByteBuffer buffer) throws IOException {
+   buffer.rewind();
+   while (buffer.hasRemaining()) {
+     channel.write(buffer);
+   }
+ }
+
+ /**
+  * Loads the trie stored in the classpath resource {@link #FILENAME}.
+  * @return the loaded trie
+  * @throws IOException if the resource is missing or cannot be read
+  */
+ public static DoubleArrayTrie getInstance() throws IOException {
+   InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
+   if (is == null) {
+     // getResourceAsStream signals "not found" with null; fail with a message instead of a later NullPointerException
+     throw new IOException("Resource not found on classpath: " + FILENAME);
+   }
+   return read(is);
+ }
+
+ /**
+  * Loads a trie from a stream in the layout produced by {@link #write(String)}:
+  * size of the base/check arrays (int), size of the tail array (int), base values,
+  * check values, tail characters. The stream is always closed before returning.
+  * @param is stream positioned at the start of the stored data
+  * @return trie backed by read-only buffers
+  * @throws IOException if reading fails or the stream ends before all data was read
+  */
+ public static DoubleArrayTrie read(InputStream is) throws IOException {
+   try {
+     DoubleArrayTrie trie = new DoubleArrayTrie();
+     DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
+     int baseCheckSize = dis.readInt(); // Read size of baseArr and checkArr
+     int tailSize = dis.readInt(); // Read size of tailArr
+     ReadableByteChannel channel = Channels.newChannel(dis);
+
+     // 4 bytes per int entry
+     trie.baseBuffer = readFully(channel, baseCheckSize * 4).asIntBuffer().asReadOnlyBuffer();
+     trie.checkBuffer = readFully(channel, baseCheckSize * 4).asIntBuffer().asReadOnlyBuffer();
+     // 2 bytes per char entry
+     trie.tailBuffer = readFully(channel, tailSize * 2).asCharBuffer().asReadOnlyBuffer();
+
+     return trie;
+   } finally {
+     // Release the stream on failure as well as on success
+     is.close();
+   }
+ }
+
+ /**
+  * Reads exactly byteCount bytes from the channel into a new direct buffer.
+  * A single channel read may deliver fewer bytes than requested, so this loops
+  * until the buffer is full and fails if the stream ends early.
+  */
+ private static ByteBuffer readFully(ReadableByteChannel channel, int byteCount) throws IOException {
+   ByteBuffer buffer = ByteBuffer.allocateDirect(byteCount);
+   while (buffer.hasRemaining()) {
+     if (channel.read(buffer) < 0) {
+       throw new IOException("Unexpected end of stream: " + buffer.remaining() + " of " + byteCount + " bytes missing");
+     }
+   }
+   buffer.rewind();
+   return buffer;
+ }
+
+ /**
+  * Construct double array trie which is equivalent to input trie.
+  * Any content from a previous build is discarded.
+  * @param trie normal trie which contains all dictionary words
+  */
+ public void build(Trie trie) {
+   // Start the tail from its first slot again; otherwise a second build would
+   // address positions beyond the freshly allocated tail buffer
+   tailIndex = TAIL_OFFSET;
+   baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+   baseBuffer.put(0, 1); // base of the root (index 0) is 1, as lookup assumes
+   checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+   tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
+   add(-1, 0, trie.getRoot()); // -1: the root has no predecessor
+ }
+
+ /**
+  * Stores a trie node, and recursively everything below it, in the double array.
+  * @param previous index of the node this one was reached from; negative for the root
+  * @param index index in the base/check arrays assigned to this node
+  * @param node node to store
+  */
+ private void add(int previous, int index, Node node) {
+   Node[] successors = node.getChildren();
+
+   // The rest of the word is unambiguous (and not just the terminator): keep it in the tail array
+   boolean restGoesToTail = successors.length > 0
+       && node.hasSinglePath()
+       && successors[0].getKey() != TERMINATING_CHARACTER;
+
+   if (restGoesToTail) {
+     baseBuffer.put(index, tailIndex); // base holds the tail reference
+     addToTail(successors[0]);
+     checkBuffer.put(index, previous);
+   } else {
+     int base = findBase(index, successors); // offset that places all children on free slots
+     baseBuffer.put(index, base);
+
+     if (previous >= 0) {
+       checkBuffer.put(index, previous);
+     }
+
+     for (int i = 0; i < successors.length; i++) {
+       Node child = successors[i];
+       add(index, index + base + child.getKey(), child);
+     }
+   }
+ }
+
+ /**
+  * Match input keyword.
+  * @param key key to match
+  * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
+  */
+ public int lookup(String key) {
+   int index = 0;
+   int base = 1; // base at index 0 should be 1
+   // Valid indices are 0 .. limit - 1 in both arrays
+   int limit = Math.min(baseBuffer.limit(), checkBuffer.limit());
+
+   int keyLength = key.length();
+   for (int i = 0; i < keyLength; i++) {
+     int previous = index;
+     index = index + base + key.charAt(i);
+
+     if (index < 0 || index >= limit) { // Outside of the arrays: no such transition
+       return -1;
+     }
+
+     base = baseBuffer.get(index);
+
+     if (base == 0) { // Didn't find match
+       return -1;
+     }
+
+     if (checkBuffer.get(index) != previous) { // check doesn't match
+       return -1;
+     }
+
+     if (base >= TAIL_OFFSET) { // If base is bigger than TAIL_OFFSET, start processing "tail"
+       return matchTail(base, index, key.substring(i + 1));
+     }
+   }
+
+   // If we reach at the end of input keyword, check if it is complete match by looking for following terminating character
+   int endIndex = index + base + TERMINATING_CHARACTER;
+   if (endIndex < 0 || endIndex >= limit) {
+     // No slot for a terminator transition exists, so the key is only a prefix
+     return 0;
+   }
+
+   return checkBuffer.get(endIndex) == index ? index : 0;
+ }
+
+ /**
+  * Check match in tail array
+  * @param base base value of the node; its tail position is base - TAIL_OFFSET
+  * @param index index of the node, returned on a complete match
+  * @param key remaining part of the keyword that has to match the tail
+  * @return index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
+  */
+ private int matchTail(int base, int index, String key) {
+   int positionInTailArr = base - TAIL_OFFSET;
+   int keyLength = key.length();
+   int terminatorPosition = positionInTailArr + keyLength;
+
+   // The key plus the character following it must lie inside the tail array;
+   // otherwise it cannot match and reading on would throw
+   if (positionInTailArr < 0 || terminatorPosition >= tailBuffer.limit()) {
+     return -1;
+   }
+
+   for (int i = 0; i < keyLength; i++) {
+     if (key.charAt(i) != tailBuffer.get(positionInTailArr + i)) {
+       return -1;
+     }
+   }
+   return tailBuffer.get(terminatorPosition) == TERMINATING_CHARACTER ? index : 0;
+ }
+
+ /**
+  * Find base value for the node at index such that every child lands on an unused slot.
+  * The slot of a child is index + base + child key; a slot counts as unused while its base value is 0.
+  * The base and check arrays are enlarged when a candidate slot lies beyond their capacity.
+  * Once a base is found, each child slot is marked as taken: -1 for the terminating character, 1 otherwise.
+  * @param index index of the current node in the base/check arrays
+  * @param nodes children of the current node
+  * @return base value for current node; if the value already stored at index is negative it is returned unchanged
+  */
+ private int findBase(int index, Node[] nodes){
+ int base = baseBuffer.get(index);
+ if(base < 0) {
+ return base; // negative base: slot was marked for the terminating character, nothing to place
+ }
+
+ while(true) {
+ boolean collision = false; // already taken?
+ for(Node node : nodes) {
+ /*
+ * NOTE:
+ * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
+ * However, this makes the array bigger. If there is a need to compact the file dat.dat, it is possible to change this
+ * here and in the other places that compute the next index.
+ * Although the size of jar file doesn't change, memory consumption will be smaller.
+ */
+ int nextIndex = index + base + node.getKey();
+
+ if(baseBuffer.capacity() <= nextIndex) {
+ // Candidate slot is outside the arrays: reallocate both to exactly nextIndex + 1 entries and copy the old content
+ int newLength = nextIndex + 1;
+ IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+ baseBuffer.rewind();
+ newBaseBuffer.put(baseBuffer);
+ baseBuffer = newBaseBuffer;
+ IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+ checkBuffer.rewind();
+ newCheckBuffer.put(checkBuffer);
+ checkBuffer = newCheckBuffer;
+ }
+
+ if(baseBuffer.get(nextIndex) != 0) { // already taken
+ base++; // check next base value
+ collision = true;
+ break; // restart the scan over all children with the new base
+ }
+ }
+
+ if(!collision){
+ break; // if there is no collision, found proper base value. Break the while loop.
+ }
+
+ }
+
+ for(Node node : nodes) {
+ baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1); // Set -1 if key is terminating character. Set default base value 1 if not.
+ }
+
+ return base;
+ }
+
+ /**
+  * Appends the key of the given node and of every node on the single path below it to the tail array.
+  * @param node first node of the path
+  */
+ private void addToTail(Node node) {
+   for (Node current = node; ; current = current.getChildren()[0]) {
+     int slot = tailIndex - TAIL_OFFSET;
+
+     if (slot + 1 > tailBuffer.capacity()) {
+       // Tail is full: enlarge it by a fixed step and carry over what was stored so far
+       int enlarged = tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100;
+       CharBuffer replacement = ByteBuffer.allocate(enlarged * 2).asCharBuffer();
+       tailBuffer.rewind();
+       replacement.put(tailBuffer);
+       tailBuffer = replacement;
+     }
+
+     tailBuffer.put(slot, current.getKey());
+     tailIndex++;
+
+     if (current.getChildren().length == 0) {
+       return; // end of the path
+     }
+   }
+ }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis.kuromoji.trie;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Trie {
+
+  // Entry point of the trie; its own key is never set
+  private Node root;
+
+  /**
+   * Creates a trie that consists of an empty root node only.
+   */
+  public Trie() {
+    root = new Node();
+  }
+
+  /**
+   * Stores a word in the trie.
+   * The terminating character (\u0001) is appended to the word before it is stored.
+   * @param value word to store
+   */
+  public void add(String value) {
+    String terminated = value + DoubleArrayTrie.TERMINATING_CHARACTER;
+    root.add(terminated);
+  }
+
+  /**
+   * Gives access to the root node, from which all other nodes can be reached.
+   * @return root node
+   */
+  public Node getRoot() {
+    return root;
+  }
+
+  /**
+   * A single node of the trie: one character plus the nodes that may follow it.
+   */
+  public class Node {
+    char key; // character represented by this node
+
+    Node[] children = new Node[0]; // nodes that follow this node, in insertion order
+
+    /**
+     * Creates a node without a key.
+     */
+    public Node() {
+    }
+
+    /**
+     * Creates a node for the given character.
+     * @param key character of this node
+     */
+    public Node(char key) {
+      this.key = key;
+    }
+
+    /**
+     * Stores a string below this node, one node per character, reusing nodes that already exist.
+     * @param value string to store; nothing happens for an empty string
+     */
+    public void add(String value) {
+      Node current = this;
+      for (int i = 0; i < value.length(); i++) {
+        current = current.addChild(new Node(value.charAt(i)));
+      }
+    }
+
+    /**
+     * Appends a node to the children of this node unless a child with the same key exists.
+     * @param newNode node to append
+     * @return the existing child with the same key, otherwise newNode
+     */
+    public Node addChild(Node newNode) {
+      Node existing = getChild(newNode.getKey());
+      if (existing != null) {
+        return existing;
+      }
+
+      int oldLength = children.length;
+      Node[] extended = new Node[oldLength + 1];
+      System.arraycopy(children, 0, extended, 0, oldLength);
+      extended[oldLength] = newNode;
+      children = extended;
+      return newNode;
+    }
+
+    /**
+     * Gives the character of this node.
+     * @return key
+     */
+    public char getKey() {
+      return key;
+    }
+
+    /**
+     * Tells whether everything below this node forms one chain without branches.
+     * For example, with "abcde" and "abfgh" stored, the answer is false for the nodes "a" and "b"
+     * and true for "c", "d", "e", "f", "g" and "h".
+     * @return true if no node below this one (nor this node itself) has more than one child
+     */
+    public boolean hasSinglePath() {
+      Node current = this;
+      while (current.children.length == 1) {
+        current = current.children[0];
+      }
+      return current.children.length == 0;
+    }
+
+    /**
+     * Gives the nodes that follow this node.
+     * @return array of children nodes
+     */
+    public Node[] getChildren() {
+      return children;
+    }
+
+    /**
+     * Searches the children of this node for a given key.
+     * @param key key to look for
+     * @return child with that key, or null if there is none
+     */
+    private Node getChild(char key) {
+      for (int i = 0; i < children.length; i++) {
+        if (children[i].getKey() == key) {
+          return children[i];
+        }
+      }
+      return null;
+    }
+  }
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java?rev=1226632&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java Tue Jan 3 04:22:59 2012
@@ -0,0 +1,101 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Minimal CSV support: splitting one line into values and preparing one value for output.
+ * A value is wrapped in double quotes when it contains a comma, and a double quote
+ * inside a value is written as two double quotes.
+ */
+public class CSVUtil {
+  private static final char QUOTE = '"';
+
+  private static final char COMMA = ',';
+
+  // A value wrapped in quotes that contains no further quote
+  private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
+
+  private static final String ESCAPED_QUOTE = "\"\"";
+
+  /**
+   * Parse CSV line
+   * @param line line to split; commas inside double quotes do not separate values
+   * @return Array of values, unquoted and unescaped. Empty array if the line contains an odd number of double quotes.
+   */
+  public static String[] parse(String line) {
+    boolean insideQuote = false;
+    ArrayList<String> result = new ArrayList<String>();
+    int quoteCount = 0;
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < line.length(); i++) {
+      char c = line.charAt(i);
+
+      if (c == QUOTE) {
+        insideQuote = !insideQuote;
+        quoteCount++;
+      }
+
+      if (c == COMMA && !insideQuote) {
+        result.add(unQuoteUnEscape(sb.toString()));
+        sb.setLength(0);
+        continue;
+      }
+
+      sb.append(c);
+    }
+
+    // Validate
+    if (quoteCount % 2 != 0) {
+      return new String[0];
+    }
+
+    // The last value gets the same treatment as all values before it
+    result.add(unQuoteUnEscape(sb.toString()));
+
+    return result.toArray(new String[result.size()]);
+  }
+
+  /**
+   * Removes the wrapping quotes of a value, if any, and turns doubled quotes back into single ones.
+   */
+  private static String unQuoteUnEscape(String original) {
+    String result = original;
+
+    // Unquote
+    Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
+    if (m.matches()) {
+      result = m.group(1);
+    } else if (isQuotedWithEscapes(original)) {
+      result = original.substring(1, original.length() - 1);
+    }
+
+    // Unescape
+    return result.replace(ESCAPED_QUOTE, "\"");
+  }
+
+  /**
+   * Tells whether a value has the form quoteEscape produces for input that contains
+   * both a comma and double quotes: wrapped in quotes, a comma inside, and every
+   * quote inside doubled. Requiring the comma keeps unwrapped values that merely
+   * start and end with escaped quotes untouched.
+   */
+  private static boolean isQuotedWithEscapes(String value) {
+    int last = value.length() - 1;
+    if (last < 2 || value.charAt(0) != QUOTE || value.charAt(last) != QUOTE) {
+      return false;
+    }
+    boolean sawComma = false;
+    for (int i = 1; i < last; i++) {
+      char c = value.charAt(i);
+      if (c == COMMA) {
+        sawComma = true;
+      } else if (c == QUOTE) {
+        // An inner quote must be followed by its twin, and the twin must not be the closing quote
+        if (i + 1 >= last || value.charAt(i + 1) != QUOTE) {
+          return false;
+        }
+        i++;
+      }
+    }
+    return sawComma;
+  }
+
+  /**
+   * Quote and escape input value for CSV
+   * @param original value to prepare
+   * @return value with every double quote doubled, wrapped in double quotes if it contains a comma
+   */
+  public static String quoteEscape(String original) {
+    String result = original.replace("\"", ESCAPED_QUOTE);
+    if (result.indexOf(COMMA) >= 0) {
+      result = "\"" + result + "\"";
+    }
+    return result;
+  }
+
+}