You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/12 21:10:52 UTC
svn commit: r1230748 [3/5] - in /lucene/dev/trunk: dev-tools/eclipse/
lucene/contrib/ modules/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/test/org/apache/lucene/analysis/util/
modules/analysis...
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,1023 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+/**
+ * Static helpers that turn the Japanese feature strings used by the analyzer
+ * (part-of-speech tags, inflection types, inflection forms) into English
+ * labels, and that romanize katakana readings.
+ * <p>
+ * The three lookup methods return <code>null</code> for a key that is not in
+ * the corresponding table.
+ */
+public class ToStringUtil {
+  // a translation map for parts of speech, only used for reflectWith
+  private static final HashMap<String,String> posTranslations = new HashMap<String,String>();
+  static {
+    posTranslations.put("名詞", "noun");
+    posTranslations.put("名詞-一般", "noun-common");
+    posTranslations.put("名詞-固有名詞", "noun-proper");
+    posTranslations.put("名詞-固有名詞-一般", "noun-proper-misc");
+    posTranslations.put("名詞-固有名詞-人名", "noun-proper-person");
+    posTranslations.put("名詞-固有名詞-人名-一般", "noun-proper-person-misc");
+    posTranslations.put("名詞-固有名詞-人名-姓", "noun-proper-person-surname");
+    posTranslations.put("名詞-固有名詞-人名-名", "noun-proper-person-given_name");
+    posTranslations.put("名詞-固有名詞-組織", "noun-proper-organization");
+    posTranslations.put("名詞-固有名詞-地域", "noun-proper-place");
+    posTranslations.put("名詞-固有名詞-地域-一般", "noun-proper-place-misc");
+    posTranslations.put("名詞-固有名詞-地域-国", "noun-proper-place-country");
+    posTranslations.put("名詞-代名詞", "noun-pronoun");
+    posTranslations.put("名詞-代名詞-一般", "noun-pronoun-misc");
+    posTranslations.put("名詞-代名詞-縮約", "noun-pronoun-contraction");
+    posTranslations.put("名詞-副詞可能", "noun-adverbial");
+    posTranslations.put("名詞-サ変接続", "noun-verbal");
+    posTranslations.put("名詞-形容動詞語幹", "noun-adjective-base");
+    posTranslations.put("名詞-数", "noun-numeric");
+    posTranslations.put("名詞-非自立", "noun-affix");
+    posTranslations.put("名詞-非自立-一般", "noun-affix-misc");
+    posTranslations.put("名詞-非自立-副詞可能", "noun-affix-adverbial");
+    posTranslations.put("名詞-非自立-助動詞語幹", "noun-affix-aux");
+    posTranslations.put("名詞-非自立-形容動詞語幹", "noun-affix-adjective-base");
+    posTranslations.put("名詞-特殊", "noun-special");
+    posTranslations.put("名詞-特殊-助動詞語幹", "noun-special-aux");
+    posTranslations.put("名詞-接尾", "noun-suffix");
+    posTranslations.put("名詞-接尾-一般", "noun-suffix-misc");
+    posTranslations.put("名詞-接尾-人名", "noun-suffix-person");
+    posTranslations.put("名詞-接尾-地域", "noun-suffix-place");
+    posTranslations.put("名詞-接尾-サ変接続", "noun-suffix-verbal");
+    posTranslations.put("名詞-接尾-助動詞語幹", "noun-suffix-aux");
+    posTranslations.put("名詞-接尾-形容動詞語幹", "noun-suffix-adjective-base");
+    posTranslations.put("名詞-接尾-副詞可能", "noun-suffix-adverbial");
+    posTranslations.put("名詞-接尾-助数詞", "noun-suffix-classifier");
+    posTranslations.put("名詞-接尾-特殊", "noun-suffix-special");
+    posTranslations.put("名詞-接続詞的", "noun-suffix-conjunctive");
+    posTranslations.put("名詞-動詞非自立的", "noun-verbal_aux");
+    posTranslations.put("名詞-引用文字列", "noun-quotation");
+    posTranslations.put("名詞-ナイ形容詞語幹", "noun-nai_adjective");
+    posTranslations.put("接頭詞", "prefix");
+    posTranslations.put("接頭詞-名詞接続", "prefix-nominal");
+    posTranslations.put("接頭詞-動詞接続", "prefix-verbal");
+    posTranslations.put("接頭詞-形容詞接続", "prefix-adjectival");
+    posTranslations.put("接頭詞-数接続", "prefix-numerical");
+    posTranslations.put("動詞", "verb");
+    posTranslations.put("動詞-自立", "verb-main");
+    posTranslations.put("動詞-非自立", "verb-auxiliary");
+    posTranslations.put("動詞-接尾", "verb-suffix");
+    posTranslations.put("形容詞", "adjective");
+    posTranslations.put("形容詞-自立", "adjective-main");
+    posTranslations.put("形容詞-非自立", "adjective-auxiliary");
+    posTranslations.put("形容詞-接尾", "adjective-suffix");
+    posTranslations.put("副詞", "adverb");
+    posTranslations.put("副詞-一般", "adverb-misc");
+    posTranslations.put("副詞-助詞類接続", "adverb-particle_conjunction");
+    posTranslations.put("連体詞", "adnominal");
+    posTranslations.put("接続詞", "conjunction");
+    posTranslations.put("助詞", "particle");
+    posTranslations.put("助詞-格助詞", "particle-case");
+    posTranslations.put("助詞-格助詞-一般", "particle-case-misc");
+    posTranslations.put("助詞-格助詞-引用", "particle-case-quote");
+    posTranslations.put("助詞-格助詞-連語", "particle-case-compound");
+    posTranslations.put("助詞-接続助詞", "particle-conjunctive");
+    posTranslations.put("助詞-係助詞", "particle-dependency");
+    posTranslations.put("助詞-副助詞", "particle-adverbial");
+    posTranslations.put("助詞-間投助詞", "particle-interjective");
+    posTranslations.put("助詞-並立助詞", "particle-coordinate");
+    posTranslations.put("助詞-終助詞", "particle-final");
+    posTranslations.put("助詞-副助詞／並立助詞／終助詞", "particle-adverbial/conjunctive/final");
+    posTranslations.put("助詞-連体化", "particle-adnominalizer");
+    // previously duplicated the "adnominalizer" value of the entry above
+    posTranslations.put("助詞-副詞化", "particle-adverbializer");
+    posTranslations.put("助詞-特殊", "particle-special");
+    posTranslations.put("助動詞", "auxiliary-verb");
+    posTranslations.put("感動詞", "interjection");
+    posTranslations.put("記号", "symbol");
+    posTranslations.put("記号-一般", "symbol-misc");
+    posTranslations.put("記号-句点", "symbol-period");
+    posTranslations.put("記号-読点", "symbol-comma");
+    posTranslations.put("記号-空白", "symbol-space");
+    posTranslations.put("記号-括弧開", "symbol-open_bracket");
+    posTranslations.put("記号-括弧閉", "symbol-close_bracket");
+    posTranslations.put("記号-アルファベット", "symbol-alphabetic");
+    posTranslations.put("その他", "other");
+    posTranslations.put("その他-間投", "other-interjection");
+    posTranslations.put("フィラー", "filler");
+    posTranslations.put("非言語音", "non-verbal");
+    posTranslations.put("語断片", "fragment");
+    posTranslations.put("未知語", "unknown");
+  }
+
+  /**
+   * Get the english form of a POS tag
+   *
+   * @param s Japanese part-of-speech tag, levels separated by '-'
+   * @return the English label, or <code>null</code> if the tag is not in the table
+   */
+  public static String getPOSTranslation(String s) {
+    return posTranslations.get(s);
+  }
+
+  // a translation map for inflection types, only used for reflectWith
+  private static final HashMap<String,String> inflTypeTranslations = new HashMap<String,String>();
+  static {
+    inflTypeTranslations.put("*", "*");
+    inflTypeTranslations.put("形容詞・アウオ段", "adj-group-a-o-u");
+    inflTypeTranslations.put("形容詞・イ段", "adj-group-i");
+    inflTypeTranslations.put("形容詞・イイ", "adj-group-ii");
+    inflTypeTranslations.put("不変化型", "non-inflectional");
+    // these two values used to be swapped (ta <-> da)
+    inflTypeTranslations.put("特殊・タ", "special-ta");
+    inflTypeTranslations.put("特殊・ダ", "special-da");
+    inflTypeTranslations.put("文語・ゴトシ", "classical-gotoshi");
+    inflTypeTranslations.put("特殊・ジャ", "special-ja");
+    inflTypeTranslations.put("特殊・ナイ", "special-nai");
+    inflTypeTranslations.put("五段・ラ行特殊", "5-row-cons-r-special");
+    inflTypeTranslations.put("特殊・ヌ", "special-nu");
+    inflTypeTranslations.put("文語・キ", "classical-ki");
+    inflTypeTranslations.put("特殊・タイ", "special-tai");
+    inflTypeTranslations.put("文語・ベシ", "classical-beshi");
+    inflTypeTranslations.put("特殊・ヤ", "special-ya");
+    inflTypeTranslations.put("文語・マジ", "classical-maji");
+    inflTypeTranslations.put("下二・タ行", "2-row-lower-cons-t");
+    inflTypeTranslations.put("特殊・デス", "special-desu");
+    inflTypeTranslations.put("特殊・マス", "special-masu");
+    inflTypeTranslations.put("五段・ラ行アル", "5-row-aru");
+    inflTypeTranslations.put("文語・ナリ", "classical-nari");
+    inflTypeTranslations.put("文語・リ", "classical-ri");
+    inflTypeTranslations.put("文語・ケリ", "classical-keri");
+    inflTypeTranslations.put("文語・ル", "classical-ru");
+    inflTypeTranslations.put("五段・カ行イ音便", "5-row-cons-k-i-onbin");
+    inflTypeTranslations.put("五段・サ行", "5-row-cons-s");
+    inflTypeTranslations.put("一段", "1-row");
+    inflTypeTranslations.put("五段・ワ行促音便", "5-row-cons-w-cons-onbin");
+    inflTypeTranslations.put("五段・マ行", "5-row-cons-m");
+    inflTypeTranslations.put("五段・タ行", "5-row-cons-t");
+    inflTypeTranslations.put("五段・ラ行", "5-row-cons-r");
+    // NOTE(review): the dash is assumed to be U+2212 MINUS SIGN - confirm against the dictionary
+    inflTypeTranslations.put("サ変・\u2212スル", "irregular-suffix-suru");
+    inflTypeTranslations.put("五段・ガ行", "5-row-cons-g");
+    inflTypeTranslations.put("サ変・\u2212ズル", "irregular-suffix-zuru");
+    inflTypeTranslations.put("五段・バ行", "5-row-cons-b");
+    inflTypeTranslations.put("五段・ワ行ウ音便", "5-row-cons-w-u-onbin");
+    inflTypeTranslations.put("下二・ダ行", "2-row-lower-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便ユク", "5-row-cons-k-cons-onbin-yuku");
+    inflTypeTranslations.put("上二・ダ行", "2-row-upper-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便", "5-row-cons-k-cons-onbin");
+    inflTypeTranslations.put("一段・得ル", "1-row-eru");
+    inflTypeTranslations.put("四段・タ行", "4-row-cons-t");
+    inflTypeTranslations.put("五段・ナ行", "5-row-cons-n");
+    inflTypeTranslations.put("下二・ハ行", "2-row-lower-cons-h");
+    inflTypeTranslations.put("四段・ハ行", "4-row-cons-h");
+    inflTypeTranslations.put("四段・バ行", "4-row-cons-b");
+    inflTypeTranslations.put("サ変・スル", "irregular-suru");
+    inflTypeTranslations.put("上二・ハ行", "2-row-upper-cons-h");
+    inflTypeTranslations.put("下二・マ行", "2-row-lower-cons-m");
+    inflTypeTranslations.put("四段・サ行", "4-row-cons-s");
+    inflTypeTranslations.put("下二・ガ行", "2-row-lower-cons-g");
+    inflTypeTranslations.put("カ変・来ル", "kuru-kanji");
+    inflTypeTranslations.put("一段・クレル", "1-row-kureru");
+    inflTypeTranslations.put("下二・得", "2-row-lower-u");
+    inflTypeTranslations.put("カ変・クル", "kuru-kana");
+    inflTypeTranslations.put("ラ変", "irregular-cons-r");
+    inflTypeTranslations.put("下二・カ行", "2-row-lower-cons-k");
+  }
+
+  /**
+   * Get the english form of inflection type
+   *
+   * @param s Japanese inflection type, or "*" for none
+   * @return the English label, or <code>null</code> if the type is not in the table
+   */
+  public static String getInflectionTypeTranslation(String s) {
+    return inflTypeTranslations.get(s);
+  }
+
+  // a translation map for inflection forms, only used for reflectWith
+  private static final HashMap<String,String> inflFormTranslations = new HashMap<String,String>();
+  static {
+    inflFormTranslations.put("*", "*");
+    inflFormTranslations.put("基本形", "base");
+    inflFormTranslations.put("文語基本形", "classical-base");
+    inflFormTranslations.put("未然ヌ接続", "imperfective-nu-connection");
+    inflFormTranslations.put("未然ウ接続", "imperfective-u-connection");
+    inflFormTranslations.put("連用タ接続", "conjunctive-ta-connection");
+    inflFormTranslations.put("連用テ接続", "conjunctive-te-connection");
+    inflFormTranslations.put("連用ゴザイ接続", "conjunctive-gozai-connection");
+    inflFormTranslations.put("体言接続", "uninflected-connection");
+    inflFormTranslations.put("仮定形", "subjunctive");
+    inflFormTranslations.put("命令ｅ", "imperative-e");
+    inflFormTranslations.put("仮定縮約１", "conditional-contracted-1");
+    inflFormTranslations.put("仮定縮約２", "conditional-contracted-2");
+    inflFormTranslations.put("ガル接続", "garu-connection");
+    inflFormTranslations.put("未然形", "imperfective");
+    inflFormTranslations.put("連用形", "conjunctive");
+    inflFormTranslations.put("音便基本形", "onbin-base");
+    inflFormTranslations.put("連用デ接続", "conjunctive-de-connection");
+    inflFormTranslations.put("未然特殊", "imperfective-special");
+    inflFormTranslations.put("命令ｉ", "imperative-i");
+    inflFormTranslations.put("連用ニ接続", "conjunctive-ni-connection");
+    inflFormTranslations.put("命令ｙｏ", "imperative-yo");
+    inflFormTranslations.put("体言接続特殊", "adnominal-special");
+    inflFormTranslations.put("命令ｒｏ", "imperative-ro");
+    inflFormTranslations.put("体言接続特殊２", "uninflected-special-connection-2");
+    inflFormTranslations.put("未然レル接続", "imperfective-reru-connection");
+    inflFormTranslations.put("現代基本形", "modern-base");
+    inflFormTranslations.put("基本形-促音便", "base-onbin"); // not sure about this
+  }
+
+  /**
+   * Get the english form of inflected form
+   *
+   * @param s Japanese inflection form, or "*" for none
+   * @return the English label, or <code>null</code> if the form is not in the table
+   */
+  public static String getInflectedFormTranslation(String s) {
+    return inflFormTranslations.get(s);
+  }
+
+  /**
+   * Romanize katakana with modified hepburn
+   * <p>
+   * The input is scanned left to right with up to two characters of
+   * lookahead. A kana followed by a small kana or by a lengthening ウ is
+   * emitted as one unit, ッ doubles the following k/s/t/p consonant, ー is
+   * dropped, and any character without a rule is copied through unchanged.
+   *
+   * @param s text to romanize; must not be <code>null</code>
+   * @return the romanized text
+   */
+  public static String getRomanization(String s) {
+    StringBuilder builder = new StringBuilder();
+    final int len = s.length();
+    for (int i = 0; i < len; i++) {
+      // maximum lookahead: 3
+      char ch = s.charAt(i);
+      char ch2 = (i < len - 1) ? s.charAt(i + 1) : 0;
+      char ch3 = (i < len - 2) ? s.charAt(i + 2) : 0;
+
+      main: switch (ch) {
+        case 'ッ':
+          // sokuon: emit the consonant of the next kana; the kana itself is
+          // handled on the next iteration
+          switch (ch2) {
+            case 'カ':
+            case 'キ':
+            case 'ク':
+            case 'ケ':
+            case 'コ':
+              builder.append('k');
+              break main;
+            case 'サ':
+            case 'シ':
+            case 'ス':
+            case 'セ':
+            case 'ソ':
+              builder.append('s');
+              break main;
+            case 'タ':
+            case 'チ':
+            case 'ツ':
+            case 'テ':
+            case 'ト':
+              builder.append('t');
+              break main;
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+              builder.append('p');
+              break main;
+          }
+          break;
+        case 'ア':
+          builder.append('a');
+          break;
+        case 'イ':
+          if (ch2 == 'ィ') {
+            builder.append("yi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("ye");
+            i++;
+          } else {
+            builder.append('i');
+          }
+          break;
+        case 'ウ':
+          switch (ch2) {
+            case 'ァ':
+              builder.append("wa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("wi");
+              i++;
+              break;
+            case 'ゥ':
+              builder.append("wu");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("we");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("wo");
+              i++;
+              break;
+            case 'ュ':
+              builder.append("wyu");
+              i++;
+              break;
+            default:
+              builder.append('u');
+              break;
+          }
+          break;
+        case 'エ':
+          builder.append('e');
+          break;
+        case 'オ':
+          i += appendLongO(builder, "", ch2);
+          break;
+        case 'カ':
+          builder.append("ka");
+          break;
+        case 'キ':
+          i += appendPalatalized(builder, "ky", "ki", ch2, ch3);
+          break;
+        case 'ク':
+          switch (ch2) {
+            case 'ァ':
+              builder.append("kwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("kwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("kwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("kwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("kwa");
+              i++;
+              break;
+            default:
+              builder.append("ku");
+              break;
+          }
+          break;
+        case 'ケ':
+          builder.append("ke");
+          break;
+        case 'コ':
+          i += appendLongO(builder, "k", ch2);
+          break;
+        case 'サ':
+          builder.append("sa");
+          break;
+        case 'シ':
+          i += appendPalatalized(builder, "sh", "shi", ch2, ch3);
+          break;
+        case 'ス':
+          if (ch2 == 'ィ') {
+            builder.append("si");
+            i++;
+          } else {
+            builder.append("su");
+          }
+          break;
+        case 'セ':
+          builder.append("se");
+          break;
+        case 'ソ':
+          i += appendLongO(builder, "s", ch2);
+          break;
+        case 'タ':
+          builder.append("ta");
+          break;
+        case 'チ':
+          i += appendPalatalized(builder, "ch", "chi", ch2, ch3);
+          break;
+        case 'ツ':
+          if (ch2 == 'ァ') {
+            builder.append("tsa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("tsi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("tse");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("tso");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tsyu");
+            i++;
+          } else {
+            builder.append("tsu");
+          }
+          break;
+        case 'テ':
+          if (ch2 == 'ィ') {
+            builder.append("ti");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("tu");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tyu");
+            i++;
+          } else {
+            builder.append("te");
+          }
+          break;
+        case 'ト':
+          i += appendLongO(builder, "t", ch2);
+          break;
+        case 'ナ':
+          builder.append("na");
+          break;
+        case 'ニ':
+          i += appendPalatalized(builder, "ny", "ni", ch2, ch3);
+          break;
+        case 'ヌ':
+          builder.append("nu");
+          break;
+        case 'ネ':
+          builder.append("ne");
+          break;
+        case 'ノ':
+          i += appendLongO(builder, "n", ch2);
+          break;
+        case 'ハ':
+          builder.append("ha");
+          break;
+        case 'ヒ':
+          i += appendPalatalized(builder, "hy", "hi", ch2, ch3);
+          break;
+        case 'フ':
+          if (ch2 == 'ャ') {
+            builder.append("fya");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("fyu");
+            i++;
+          } else if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("fye");
+            i += 2;
+          } else if (ch2 == 'ョ') {
+            builder.append("fyo");
+            i++;
+          } else if (ch2 == 'ァ') {
+            builder.append("fa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("fi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("fe");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("fo");
+            i++;
+          } else {
+            builder.append("fu");
+          }
+          break;
+        case 'ヘ':
+          builder.append("he");
+          break;
+        case 'ホ':
+          if (ch2 == 'ゥ') {
+            builder.append("hu");
+            i++;
+          } else {
+            i += appendLongO(builder, "h", ch2);
+          }
+          break;
+        case 'マ':
+          builder.append("ma");
+          break;
+        case 'ミ':
+          i += appendPalatalized(builder, "my", "mi", ch2, ch3);
+          break;
+        case 'ム':
+          builder.append("mu");
+          break;
+        case 'メ':
+          // was "mi", which collided with the romanization of ミ
+          builder.append("me");
+          break;
+        case 'モ':
+          i += appendLongO(builder, "m", ch2);
+          break;
+        case 'ヤ':
+          builder.append("ya");
+          break;
+        case 'ユ':
+          builder.append("yu");
+          break;
+        case 'ヨ':
+          i += appendLongO(builder, "y", ch2);
+          break;
+        case 'ラ':
+          builder.append("ra");
+          break;
+        case 'リ':
+          i += appendPalatalized(builder, "ry", "ri", ch2, ch3);
+          break;
+        case 'ル':
+          builder.append("ru");
+          break;
+        case 'レ':
+          builder.append("re");
+          break;
+        case 'ロ':
+          i += appendLongO(builder, "r", ch2);
+          break;
+        case 'ワ':
+          builder.append("wa");
+          break;
+        case 'ヰ':
+          builder.append("i");
+          break;
+        case 'ヱ':
+          builder.append("e");
+          break;
+        case 'ヲ':
+          builder.append("o");
+          break;
+        case 'ン':
+          // every branch leaves the outer switch, so there is no fall-through
+          // into the next case
+          switch (ch2) {
+            case 'バ':
+            case 'ビ':
+            case 'ブ':
+            case 'ベ':
+            case 'ボ':
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+            case 'マ':
+            case 'ミ':
+            case 'ム':
+            case 'メ':
+            case 'モ':
+              builder.append('m');
+              break main;
+            case 'ヤ':
+            case 'ユ':
+            case 'ヨ':
+            case 'ア':
+            case 'イ':
+            case 'ウ':
+            case 'エ':
+            case 'オ':
+              builder.append("n'");
+              break main;
+            default:
+              builder.append("n");
+              break main;
+          }
+        case 'ガ':
+          builder.append("ga");
+          break;
+        case 'ギ':
+          i += appendPalatalized(builder, "gy", "gi", ch2, ch3);
+          break;
+        case 'グ':
+          switch (ch2) {
+            case 'ァ':
+              builder.append("gwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("gwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("gwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("gwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("gwa");
+              i++;
+              break;
+            default:
+              builder.append("gu");
+              break;
+          }
+          break;
+        case 'ゲ':
+          builder.append("ge");
+          break;
+        case 'ゴ':
+          i += appendLongO(builder, "g", ch2);
+          break;
+        case 'ザ':
+          builder.append("za");
+          break;
+        case 'ジ':
+          i += appendPalatalized(builder, "j", "ji", ch2, ch3);
+          break;
+        case 'ズ':
+          if (ch2 == 'ィ') {
+            builder.append("zi");
+            i++;
+          } else {
+            builder.append("zu");
+          }
+          break;
+        case 'ゼ':
+          builder.append("ze");
+          break;
+        case 'ゾ':
+          i += appendLongO(builder, "z", ch2);
+          break;
+        case 'ダ':
+          builder.append("da");
+          break;
+        case 'ヂ':
+          builder.append("ji");
+          break;
+        case 'ヅ':
+          builder.append("zu");
+          break;
+        case 'デ':
+          if (ch2 == 'ィ') {
+            builder.append("di");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("dyu");
+            i++;
+          } else {
+            builder.append("de");
+          }
+          break;
+        case 'ド':
+          if (ch2 == 'ゥ') {
+            builder.append("du");
+            i++;
+          } else {
+            i += appendLongO(builder, "d", ch2);
+          }
+          break;
+        case 'バ':
+          builder.append("ba");
+          break;
+        case 'ビ':
+          i += appendPalatalized(builder, "by", "bi", ch2, ch3);
+          break;
+        case 'ブ':
+          builder.append("bu");
+          break;
+        case 'ベ':
+          builder.append("be");
+          break;
+        case 'ボ':
+          i += appendLongO(builder, "b", ch2);
+          break;
+        case 'パ':
+          builder.append("pa");
+          break;
+        case 'ピ':
+          i += appendPalatalized(builder, "py", "pi", ch2, ch3);
+          break;
+        case 'プ':
+          builder.append("pu");
+          break;
+        case 'ペ':
+          builder.append("pe");
+          break;
+        case 'ポ':
+          i += appendLongO(builder, "p", ch2);
+          break;
+        case 'ヴ':
+          if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("vye");
+            i += 2;
+          } else {
+            builder.append('v');
+          }
+          break;
+        case 'ァ':
+          builder.append('a');
+          break;
+        case 'ィ':
+          builder.append('i');
+          break;
+        case 'ゥ':
+          builder.append('u');
+          break;
+        case 'ェ':
+          builder.append('e');
+          break;
+        case 'ォ':
+          builder.append('o');
+          break;
+        case 'ヮ':
+          builder.append("wa");
+          break;
+        case 'ャ':
+          builder.append("ya");
+          break;
+        case 'ュ':
+          builder.append("yu");
+          break;
+        case 'ョ':
+          builder.append("yo");
+          break;
+        case 'ー':
+          // a stand-alone length mark produces no output
+          break;
+        default:
+          builder.append(ch);
+      }
+    }
+    return builder.toString();
+  }
+
+  /**
+   * Appends <code>consonant + "ō"</code> when the next character is ウ,
+   * otherwise <code>consonant + "o"</code>.
+   *
+   * @return number of lookahead characters consumed (1 or 0)
+   */
+  private static int appendLongO(StringBuilder builder, String consonant, char ch2) {
+    builder.append(consonant);
+    if (ch2 == 'ウ') {
+      builder.append('ō');
+      return 1;
+    }
+    builder.append('o');
+    return 0;
+  }
+
+  /**
+   * Appends the romanization of an i-row kana that may be followed by a small
+   * ャ/ュ/ョ/ェ, and for ュ/ョ by a lengthening ウ.
+   *
+   * @param stem  prefix used when a small kana follows, e.g. "ky"
+   * @param plain romanization of the kana on its own, e.g. "ki"
+   * @return number of lookahead characters consumed (0, 1 or 2)
+   */
+  private static int appendPalatalized(StringBuilder builder, String stem, String plain, char ch2, char ch3) {
+    if (ch2 == 'ョ' && ch3 == 'ウ') {
+      builder.append(stem).append('ō');
+      return 2;
+    }
+    if (ch2 == 'ュ' && ch3 == 'ウ') {
+      builder.append(stem).append('ū');
+      return 2;
+    }
+    switch (ch2) {
+      case 'ャ':
+        builder.append(stem).append('a');
+        return 1;
+      case 'ョ':
+        builder.append(stem).append('o');
+        return 1;
+      case 'ュ':
+        builder.append(stem).append('u');
+        return 1;
+      case 'ェ':
+        builder.append(stem).append('e');
+        return 1;
+      default:
+        builder.append(plain);
+        return 0;
+    }
+  }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,226 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+/**
+ * Renders a Viterbi lattice as a Graphviz "dot" digraph.
+ * <p>
+ * Every lattice node becomes a graph node labelled with its surface form and
+ * word cost, and every pair (node ending at position i, node starting at
+ * position i) becomes an edge labelled with the connection cost. Edges on an
+ * optional best path are highlighted.
+ * <p>
+ * Instances keep per-call scratch state in fields, so one instance must not be
+ * used by several threads at once.
+ */
+public class GraphvizFormatter {
+
+  private final static String BOS_LABEL = "BOS";
+
+  private final static String EOS_LABEL = "EOS";
+
+  private final static String FONT_NAME = "Helvetica";
+
+  private final ConnectionCosts costs;
+
+  // node id -> node, used to emit each node declaration only once
+  private final Map<String, ViterbiNode> nodeMap;
+
+  // node id -> id of its successor on the best path
+  private final Map<String, String> bestPathMap;
+
+  // true once the first word-id-0 KNOWN node has been labelled BOS
+  private boolean foundBOS;
+
+  /**
+   * @param costs connection cost table used to label edges
+   */
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.nodeMap = new HashMap<String, ViterbiNode>();
+    this.bestPathMap = new HashMap<String, String>();
+  }
+
+  /**
+   * Formats the lattice without highlighting any path.
+   *
+   * @param startsArray nodes indexed by start position
+   * @param endsArray nodes indexed by end position
+   * @return the complete dot source
+   */
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    return format(startsArray, endsArray, null);
+  }
+
+  /**
+   * Formats the lattice and highlights the edges between consecutive nodes of
+   * <code>bestPath</code>.
+   *
+   * @param startsArray nodes indexed by start position
+   * @param endsArray nodes indexed by end position
+   * @param bestPath nodes of the path to highlight, in order; <code>null</code> for none
+   * @return the complete dot source
+   */
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
+    initBestPathMap(bestPath);
+
+    StringBuilder sb = new StringBuilder();
+    sb.append(formatHeader());
+    sb.append(formatNodes(startsArray, endsArray));
+    sb.append(formatTrailer());
+    return sb.toString();
+  }
+
+  /** Rebuilds the id-to-successor-id map from the given path; a null path clears it. */
+  private void initBestPathMap(List<ViterbiNode> bestPath) {
+    this.bestPathMap.clear();
+
+    if (bestPath == null) {
+      return;
+    }
+    for (int i = 0; i < bestPath.size() - 1; i++) {
+      ViterbiNode from = bestPath.get(i);
+      ViterbiNode to = bestPath.get(i + 1);
+
+      String fromId = getNodeId(from);
+      String toId = getNodeId(to);
+
+      assert this.bestPathMap.containsKey(fromId) == false;
+      assert this.bestPathMap.containsValue(toId) == false;
+      this.bestPathMap.put(fromId, toId);
+    }
+  }
+
+  /**
+   * Emits node declarations and edges for every position from 1 upwards where
+   * both arrays have an entry.
+   */
+  private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    this.nodeMap.clear();
+    this.foundBOS = false;
+
+    StringBuilder sb = new StringBuilder();
+    for (int i = 1; i < endsArray.length; i++) {
+      if (endsArray[i] == null || startsArray[i] == null) {
+        continue;
+      }
+      for (int j = 0; j < endsArray[i].length; j++) {
+        ViterbiNode from = endsArray[i][j];
+        if (from == null) {
+          continue;
+        }
+        sb.append(formatNodeIfNew(from));
+        for (int k = 0; k < startsArray[i].length; k++) {
+          ViterbiNode to = startsArray[i][k];
+          if (to == null) {
+            // the first empty slot ends the scan of this position's start nodes
+            break;
+          }
+          sb.append(formatNodeIfNew(to));
+          sb.append(formatEdge(from, to));
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /** Returns the node declaration the first time an id is seen, and "" afterwards. */
+  private String formatNodeIfNew(ViterbiNode node) {
+    String nodeId = getNodeId(node);
+    if (this.nodeMap.containsKey(nodeId)) {
+      return "";
+    }
+    this.nodeMap.put(nodeId, node);
+    return formatNode(node);
+  }
+
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
+    sb.append("# A2 paper size\n");
+    sb.append("size = \"34.4,16.5\";\n");
+    sb.append("# try to fill paper\n");
+    sb.append("ratio = fill;\n");
+    sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+
+    return sb.toString();
+  }
+
+  private String formatTrailer() {
+    return "}";
+  }
+
+  /** Formats one edge, highlighted when it lies on the best path. */
+  private String formatEdge(ViterbiNode from, ViterbiNode to) {
+    // stored values are never null, so a null result means "from" is not on the path
+    String bestSuccessor = this.bestPathMap.get(getNodeId(from));
+    if (bestSuccessor != null && bestSuccessor.equals(getNodeId(to))) {
+      return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
+    }
+    return formatEdge(from, to, "");
+  }
+
+  /** Formats one edge labelled with its connection cost, with extra dot attributes appended. */
+  private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getNodeId(from));
+    sb.append(" -> ");
+    sb.append(getNodeId(to));
+    sb.append(" [ ");
+    sb.append("label=\"");
+    sb.append(getCost(from, to));
+    sb.append("\"");
+    sb.append(" ");
+    sb.append(attributes);
+    sb.append(" ");
+    sb.append(" ]");
+    sb.append("\n");
+    return sb.toString();
+  }
+
+  private String formatNode(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("\"");
+    sb.append(getNodeId(node));
+    sb.append("\"");
+    sb.append(" [ ");
+    sb.append("label=");
+    sb.append(formatNodeLabel(node));
+    sb.append(" ]");
+    return sb.toString();
+  }
+
+  /** Builds the HTML-like label: surface form (or BOS/EOS) above the word cost. */
+  private String formatNodeLabel(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("<<table border=\"0\" cellborder=\"0\">");
+    sb.append("<tr><td>");
+    // the surface form is arbitrary input text; unescaped markup characters
+    // would make the HTML-like label, and so the whole graph, unparseable
+    sb.append(escapeHtml(getNodeLabel(node)));
+    sb.append("</td></tr>");
+    sb.append("<tr><td>");
+    sb.append("<font color=\"blue\">");
+    sb.append(node.getWordCost());
+    sb.append("</font>");
+    sb.append("</td></tr>");
+    // sb.append("<tr><td>");
+    // sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
+    // sb.append("</td></tr>");
+    sb.append("</table>>");
+    return sb.toString();
+  }
+
+  /** Replaces the characters that are special inside a Graphviz HTML-like label. */
+  private static String escapeHtml(String text) {
+    if (text == null) {
+      // same text StringBuilder.append((String) null) would have produced
+      return "null";
+    }
+    StringBuilder sb = new StringBuilder(text.length());
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+        case '&':
+          sb.append("&amp;");
+          break;
+        case '<':
+          sb.append("&lt;");
+          break;
+        case '>':
+          sb.append("&gt;");
+          break;
+        case '"':
+          sb.append("&quot;");
+          break;
+        default:
+          sb.append(c);
+      }
+    }
+    return sb.toString();
+  }
+
+  /** Node ids are the decimal form of the node's hashCode(). */
+  private String getNodeId(ViterbiNode node) {
+    return String.valueOf(node.hashCode());
+  }
+
+  /**
+   * Returns the surface form, except for KNOWN nodes with word id 0: the first
+   * such node seen during a format call is labelled BOS, later ones EOS.
+   */
+  private String getNodeLabel(ViterbiNode node) {
+    if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
+      if (this.foundBOS) {
+        return EOS_LABEL;
+      } else {
+        this.foundBOS = true;
+        return BOS_LABEL;
+      }
+    } else {
+      return node.getSurfaceFormString();
+    }
+  }
+
+  private int getCost(ViterbiNode from, ViterbiNode to) {
+    return this.costs.get(from.getLeftId(), to.getRightId());
+  }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,361 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
+import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST;
+
+public class Viterbi {
+
+ private final TokenInfoFST fst;
+
+ private final TokenInfoDictionary dictionary;
+
+ private final UnknownDictionary unkDictionary;
+
+ private final ConnectionCosts costs;
+
+ private final UserDictionary userDictionary;
+
+ private final CharacterDefinition characterDefinition;
+
+ private final boolean useUserDictionary;
+
+ private final boolean searchMode;
+
+ private final boolean extendedMode;
+
+ private static final int DEFAULT_COST = 10000000;
+
+ private static final int SEARCH_MODE_LENGTH_KANJI = 3;
+
+ private static final int SEARCH_MODE_LENGTH = 7;
+
+ private static final int SEARCH_MODE_PENALTY = 10000;
+
+ private static final char[] BOS = "BOS".toCharArray();
+
+ private static final char[] EOS = "EOS".toCharArray();
+
+ /**
+ * Constructor
+ */
+ public Viterbi(TokenInfoDictionary dictionary,
+ UnknownDictionary unkDictionary,
+ ConnectionCosts costs,
+ UserDictionary userDictionary,
+ Mode mode) {
+ this.dictionary = dictionary;
+ this.fst = dictionary.getFST();
+ this.unkDictionary = unkDictionary;
+ this.costs = costs;
+ this.userDictionary = userDictionary;
+ if(userDictionary == null) {
+ this.useUserDictionary = false;
+ } else {
+ this.useUserDictionary = true;
+ }
+
+ switch(mode){
+ case SEARCH:
+ searchMode = true;
+ extendedMode = false;
+ break;
+ case EXTENDED:
+ searchMode = true;
+ extendedMode = true;
+ break;
+ default:
+ searchMode = false;
+ extendedMode = false;
+ break;
+ }
+
+ this.characterDefinition = unkDictionary.getCharacterDefinition();
+ }
+
+ /**
+ * Find best path from input lattice.
+ * @param lattice the result of build method
+ * @return List of ViterbiNode which consist best path
+ */
+ public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
+ ViterbiNode[][] startIndexArr = lattice[0];
+ ViterbiNode[][] endIndexArr = lattice[1];
+
+ for (int i = 1; i < startIndexArr.length; i++){
+
+ if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists.
+ continue;
+ }
+
+ for (ViterbiNode node : startIndexArr[i]) {
+ if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+ break;
+ }
+
+ int backwardConnectionId = node.getLeftId();
+ int wordCost = node.getWordCost();
+ int leastPathCost = DEFAULT_COST;
+ for (ViterbiNode leftNode : endIndexArr[i]) {
+ if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+ break;
+ }
+
+ int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
+
+ // "Search mode". Add extra costs if it is long node.
+ if (searchMode) {
+ // System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
+ char[] surfaceForm = node.getSurfaceForm();
+ int offset = node.getOffset();
+ int length = node.getLength();
+ if (length > SEARCH_MODE_LENGTH_KANJI) {
+ boolean allKanji = true;
+ // check if node consists of only kanji
+ for (int pos = 0; pos < length; pos++) {
+ if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
+ allKanji = false;
+ break;
+ }
+ }
+
+ if (allKanji) { // Process only Kanji keywords
+ pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
+ } else if (length > SEARCH_MODE_LENGTH) {
+ pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
+ }
+ }
+ }
+
+ if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left).
+ leastPathCost = pathCost;
+ node.setPathCost(leastPathCost);
+ node.setLeftNode(leftNode);
+ }
+ }
+ }
+ }
+
+ // track best path
+ ViterbiNode node = endIndexArr[0][0]; // EOS
+ LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
+ result.add(node);
+ while (true) {
+ ViterbiNode leftNode = node.getLeftNode();
+ if (leftNode == null) {
+ break;
+ }
+
+ // EXTENDED mode convert unknown word into unigram node
+ if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
+ byte unigramWordId = CharacterDefinition.NGRAM;
+ int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
+ int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
+ int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
+ char[] surfaceForm = leftNode.getSurfaceForm();
+ int offset = leftNode.getOffset();
+ int length = leftNode.getLength();
+ for (int i = length - 1; i >= 0; i--) {
+ int charLen = 1;
+ if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
+ i--;
+ charLen = 2;
+ }
+ ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
+ result.addFirst(uniGramNode);
+ }
+ } else {
+ result.addFirst(leftNode);
+ }
+ node = leftNode;
+ }
+
+ return result;
+ }
+
+ /**
+ * Build lattice from input text
+ * @param text
+ */
+ public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
+ ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
+ ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
+ int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
+ int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
+ FST.Arc<Long> arc = new FST.Arc<Long>();
+ ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
+ addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+
+ // Process user dictionary;
+ if (useUserDictionary) {
+ processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+
+ int unknownWordEndIndex = -1; // index of the last character of unknown word
+
+ final IntsRef wordIdRef = new IntsRef();
+
+ for (int startIndex = 0; startIndex < length; startIndex++) {
+ // If no token ends where current token starts, skip this index
+ if (endSizeArr[startIndex + 1] == 0) {
+ continue;
+ }
+
+ int suffixStart = offset + startIndex;
+ int suffixLength = length - startIndex;
+
+ boolean found = false;
+ arc = fst.getFirstArc(arc);
+ int output = 0;
+ for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
+ int ch = text[suffixStart + endIndex - 1];
+
+ if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
+ break; // continue to next position
+ }
+ output += arc.output.intValue();
+
+ if (arc.isFinal()) {
+ output += arc.nextFinalOutput.intValue();
+ found = true; // Don't produce unknown word starting from this index
+ dictionary.lookupWordIds(output, wordIdRef);
+ for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+ final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
+ ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+ addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ }
+ }
+
+ // In the case of normal mode, it doesn't process unknown word greedily.
+ if(!searchMode && unknownWordEndIndex > startIndex){
+ continue;
+ }
+
+ // Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
+ int unknownWordLength = 0;
+ char firstCharacter = text[suffixStart];
+ boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
+ if (isInvoke){ // Process "invoke"
+ unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
+ } else if (found == false){ // Process not "invoke"
+ unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
+ }
+
+ if (unknownWordLength > 0) { // found unknown word
+ final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+ unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
+ for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+ final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
+ ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+ addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ unknownWordEndIndex = startIndex + unknownWordLength;
+ }
+ }
+
+ ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+ addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
+
+ ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
+
+ return result;
+ }
+
+ /**
+ * Find token(s) in input text and set found token(s) in arrays as normal tokens
+ * @param text
+ * @param startIndexArr
+ * @param endIndexArr
+ * @param startSizeArr
+ * @param endSizeArr
+ */
+ private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
+ int[][] result = userDictionary.lookup(text, offset, len);
+ for(int[] segmentation : result) {
+ int wordId = segmentation[0];
+ int index = segmentation[1];
+ int length = segmentation[2];
+ ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
+ addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ }
+
+ /**
+ * Add node to arrays and increment count in size array
+ * @param node
+ * @param startIndex
+ * @param endIndex
+ * @param startIndexArr
+ * @param endIndexArr
+ * @param startSizeArr
+ * @param endSizeArr
+ */
+ private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
+ int startNodesCount = startSizeArr[startIndex];
+ int endNodesCount = endSizeArr[endIndex];
+
+ if (startNodesCount == 0) {
+ startIndexArr[startIndex] = new ViterbiNode[10];
+ }
+
+ if (endNodesCount == 0) {
+ endIndexArr[endIndex] = new ViterbiNode[10];
+ }
+
+ if (startIndexArr[startIndex].length <= startNodesCount){
+ startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
+ }
+
+ if (endIndexArr[endIndex].length <= endNodesCount){
+ endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
+ }
+
+ startIndexArr[startIndex][startNodesCount] = node;
+ endIndexArr[endIndex][endNodesCount] = node;
+
+ startSizeArr[startIndex] = startNodesCount + 1;
+ endSizeArr[endIndex] = endNodesCount + 1;
+ }
+
+
+ /**
+ * Return twice as big array which contains value of input array
+ * @param array
+ * @return
+ */
+ private ViterbiNode[] extendArray(ViterbiNode[] array) {
+ //extend array
+ ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
+ System.arraycopy(array, 0, newArray, 0, array.length);
+ return newArray;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,147 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /**
+  * A single node of the Viterbi lattice: a candidate token described by a
+  * slice of a character buffer, its dictionary ids and costs, and the best
+  * left neighbour found so far.
+  */
+ public final class ViterbiNode {
+
+   /** Origin of a node. */
+   public enum Type {
+     KNOWN,
+     UNKNOWN,
+     USER
+   }
+
+   // Immutable description of the token.
+   private final int wordId;
+   private final char[] surfaceForm;
+   private final int offset;
+   private final int length;
+   private final int leftId;
+   private final int rightId;
+   /** word cost for this node */
+   private final int wordCost;
+   private final int startIndex;
+   private final Type type;
+
+   // Mutable search state.
+   /** minimum path cost found thus far */
+   private int pathCost;
+   private ViterbiNode leftNode;
+
+   /**
+    * Creates a node; the path cost starts at 0 and the left node at {@code null}.
+    *
+    * @param wordId id of the word
+    * @param surfaceForm buffer holding the surface form (stored by reference, not copied)
+    * @param offset start offset of the surface form within {@code surfaceForm}
+    * @param length number of characters of the surface form
+    * @param leftId left id of the word
+    * @param rightId right id of the word
+    * @param wordCost cost of the word itself
+    * @param startIndex start index of the node
+    * @param type origin of the node
+    */
+   public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+     this.wordId = wordId;
+     this.type = type;
+     this.startIndex = startIndex;
+
+     this.surfaceForm = surfaceForm;
+     this.offset = offset;
+     this.length = length;
+
+     this.leftId = leftId;
+     this.rightId = rightId;
+     this.wordCost = wordCost;
+   }
+
+   /**
+    * @return the wordId
+    */
+   public int getWordId() {
+     return this.wordId;
+   }
+
+   /**
+    * @return the buffer holding the surface form (the internal array, not a copy)
+    */
+   public char[] getSurfaceForm() {
+     return this.surfaceForm;
+   }
+
+   /**
+    * @return start offset into surfaceForm
+    */
+   public int getOffset() {
+     return this.offset;
+   }
+
+   /**
+    * @return length of surfaceForm
+    */
+   public int getLength() {
+     return this.length;
+   }
+
+   /**
+    * @return the surfaceForm as a String
+    */
+   public String getSurfaceFormString() {
+     return String.valueOf(this.surfaceForm, this.offset, this.length);
+   }
+
+   /**
+    * @return the leftId
+    */
+   public int getLeftId() {
+     return this.leftId;
+   }
+
+   /**
+    * @return the rightId
+    */
+   public int getRightId() {
+     return this.rightId;
+   }
+
+   /**
+    * @return the word cost of this node
+    */
+   public int getWordCost() {
+     return this.wordCost;
+   }
+
+   /**
+    * @return the path cost last set on this node (0 if never set)
+    */
+   public int getPathCost() {
+     return this.pathCost;
+   }
+
+   /**
+    * @param pathCost minimum path cost found thus far
+    */
+   public void setPathCost(int pathCost) {
+     this.pathCost = pathCost;
+   }
+
+   /**
+    * @param node the node to record as this node's left neighbour
+    */
+   public void setLeftNode(ViterbiNode node) {
+     this.leftNode = node;
+   }
+
+   /**
+    * @return the left neighbour last set on this node, or {@code null} if none was set
+    */
+   public ViterbiNode getLeftNode() {
+     return this.leftNode;
+   }
+
+   /**
+    * @return the start index given at construction
+    */
+   public int getStartIndex() {
+     return this.startIndex;
+   }
+
+   /**
+    * @return the type given at construction
+    */
+   public Type getType() {
+     return this.type;
+   }
+ }
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html Thu Jan 12 20:10:48 2012
@@ -0,0 +1,26 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>
+ <title>
+ analyzers-kuromoji
+ </title>
+ </head>
+ <body>
+ analyzers-kuromoji
+ </body>
+</html>
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24fst.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24inflDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24inflDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.