You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/02/16 18:47:07 UTC
svn commit: r1245098 - in /lucene/dev/branches/lucene3767: lucene/test-framework/src/java/org/apache/lucene/analysis/ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/...

Author: mikemccand
Date: Thu Feb 16 17:47:06 2012
New Revision: 1245098

URL: http://svn.apache.org/viewvc?rev=1245098&view=rev
Log:
LUCENE-3767: get toDot working for Viterbi lattice; replace old tokenizer w/ new; leave default at Mode.SEARCH; no more nocommits

Added:
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java   (with props)
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
      - copied, changed from r1244433, lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
Removed:
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
Modified:
    lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
    lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java

Modified: lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java (original)
+++ lucene/dev/branches/lucene3767/lucene/test-framework/src/java/org/apache/lucene/analysis/TokenStreamToDot.java Thu Feb 16 17:47:06 2012
@@ -67,7 +67,7 @@ public class TokenStreamToDot {
       final boolean isFirst = pos == -1;
       int posInc = posIncAtt.getPositionIncrement();
       if (isFirst && posInc == 0) {
-        // nocommit hmm are TS's still allowed to do this...?
+        // TODO: hmm are TS's still allowed to do this...?
         System.err.println("WARNING: first posInc was 0; correcting to 1");
         posInc = 1;
       }

Added: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java?rev=1245098&view=auto
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java (added)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/GraphvizFormatter.java Thu Feb 16 17:47:06 2012
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Position;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.WrappedPositionArray;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+
+
+// TODO: would be nice to show the 2nd best path in a different
+// color...
+/** Builds the graphviz (dot) text of the Viterbi lattice, one backtraced fragment at a time. */
+public class GraphvizFormatter {
+  // NOTE(review): BOS_LABEL and EOS_LABEL are not referenced in this class; the same text is repeated as literals below.
+  private final static String BOS_LABEL = "BOS";
+  
+  private final static String EOS_LABEL = "EOS";
+  // Font name used for both the edge and the node defaults in formatHeader().
+  private final static String FONT_NAME = "Helvetica";
+  // Cost table queried via costs.get(...) when labeling each arc.
+  private final ConnectionCosts costs;
+  // Best-path arcs of the current fragment, keyed from-node ID -> to-node ID.
+  private final Map<String, String> bestPathMap;
+  // Accumulates the whole dot document across onBacktrace calls.
+  private final StringBuilder sb = new StringBuilder();
+  /** Starts the document: header, an invisible "init" node, and a BOS-labeled arc from init to node 0.0. */
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.bestPathMap = new HashMap<String, String>();
+    sb.append(formatHeader());
+    sb.append("  init [style=invis]\n");
+    sb.append("  init -> 0.0 [label=\"BOS\"]\n");
+  }
+  /** Appends the trailer and returns the document. NOTE(review): every call appends another trailer, so call this once. */
+  public String finish() {
+    sb.append(formatTrailer());
+    return sb.toString();
+  }
+  // Backtraces another incremental fragment: records its best path, then appends its nodes and arcs.
+  // When isEnd is true, also adds an invisible "fini" node and an EOS-labeled arc from node (endPosData.pos, fromIDX).
+  void onBacktrace(KuromojiTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
+    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
+    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
+    if (isEnd) {
+      sb.append("  fini [style=invis]\n");
+      sb.append("  ");
+      sb.append(getNodeID(endPosData.pos, fromIDX));
+      sb.append(" -> fini [label=\"EOS\"]"); // NOTE(review): unlike the other statements, no trailing newline is appended
+    }
+  }
+
+  // Records which arcs make up the best path, following back pointers from (endPosData.pos, fromIDX) while pos > startPos:
+  private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
+    bestPathMap.clear(); // only the current fragment's path is kept
+
+    int pos = endPosData.pos;
+    int bestIDX = fromIDX;
+    while (pos > startPos) {
+      final Position posData = positions.get(pos);
+
+      final int backPos = posData.backPos[bestIDX];
+      final int backIDX = posData.backIndex[bestIDX];
+
+      final String toNodeID = getNodeID(pos, bestIDX);
+      final String fromNodeID = getNodeID(backPos, backIDX);
+      
+      assert !bestPathMap.containsKey(fromNodeID);
+      assert !bestPathMap.containsValue(toNodeID);
+      bestPathMap.put(fromNodeID, toNodeID);
+      pos = backPos; // step to the predecessor node
+      bestIDX = backIDX;
+    }
+  }
+  // Returns dot statements for every node at positions startPos+1..endPosData.pos and for the arc arriving at each of them.
+  private String formatNodes(KuromojiTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {
+
+    StringBuilder sb = new StringBuilder(); // local buffer; shadows the field of the same name
+    // Output nodes, labeled with pos, then ": ", then lastRightID[idx]
+    for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        sb.append("  ");
+        sb.append(getNodeID(pos, idx));
+        sb.append(" [label=\"");
+        sb.append(pos);
+        sb.append(": ");
+        sb.append(posData.lastRightID[idx]);
+        sb.append("\"]\n");
+      }
+    }
+
+    // Output arcs, labeled with the surface form, a space, the word cost, and the signed cost from costs.get(...)
+    for (int pos = endPosData.pos; pos > startPos; pos--) {
+      final Position posData = positions.get(pos);
+      for(int idx=0;idx<posData.count;idx++) {
+        final Position backPosData = positions.get(posData.backPos[idx]);
+        final String toNodeID = getNodeID(pos, idx);
+        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
+
+        sb.append("  ");
+        sb.append(fromNodeID);
+        sb.append(" -> ");
+        sb.append(toNodeID);
+
+        final String attrs;
+        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
+          // This arc is on best path
+          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
+        } else {
+          attrs = "";
+        }
+
+        final Dictionary dict = tok.getDict(posData.backType[idx]);
+        final int wordCost = dict.getWordCost(posData.backID[idx]);
+        final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
+                                     dict.getLeftId(posData.backID[idx]));
+        // Slice this arc's text out of fragment; offsets are taken relative to startPos.
+        final String surfaceForm = new String(fragment,
+                                              posData.backPos[idx] - startPos,
+                                              pos - posData.backPos[idx]);
+        
+        sb.append(" [label=\"");
+        sb.append(surfaceForm); // NOTE(review): appended without any dot escaping; confirm quotes/backslashes cannot occur here
+        sb.append(' ');
+        sb.append(wordCost);
+        if (bgCost >= 0) { // negative values already print their own sign
+          sb.append('+');
+        }
+        sb.append(bgCost);
+        sb.append("\"");
+        sb.append(attrs);
+        sb.append("]\n");
+      }
+    }
+    return sb.toString();
+  }
+  // Returns the opening of the digraph plus the graph, edge, and node default attributes.
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+    //sb.append("  // A2 paper size\n");
+    //sb.append("  size = \"34.4,16.5\";\n");
+    //sb.append("  // try to fill paper\n");
+    //sb.append("  ratio = fill;\n");
+    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
+  }
+  // Returns the brace that closes the digraph opened by formatHeader().
+  private String formatTrailer() {
+    return "}";
+  }
+  // Node IDs are the position and the index joined by a dot, e.g. 0.0.
+  private String getNodeID(int pos, int idx) {
+    return pos + "." + idx;
+  }
+}

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java Thu Feb 16 17:47:06 2012
@@ -27,36 +27,26 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.util.Version;
 
 public class KuromojiAnalyzer extends StopwordAnalyzerBase {
-  private final Segmenter segmenter;
   private final Mode mode;
   private final Set<String> stoptags;
   private final UserDictionary userDict;
   
   public KuromojiAnalyzer(Version matchVersion) {
-    this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+    this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
   }
   
-  public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
-    super(matchVersion, stopwords);
-    this.segmenter = segmenter;
-    this.stoptags = stoptags;
-    userDict = null;
-    mode = Segmenter.DEFAULT_MODE;
-  }
-
   public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
     super(matchVersion, stopwords);
     this.userDict = userDict;
     this.mode = mode;
     this.stoptags = stoptags;
-    this.segmenter = null;
   }
   
   public static CharArraySet getDefaultStopSet(){
@@ -93,8 +83,7 @@ public class KuromojiAnalyzer extends St
   
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    //Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
-    Tokenizer tokenizer = new KuromojiTokenizer2(reader, userDict, true, mode);
+    Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode);
     TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
     stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
     stream = new CJKWidthFilter(stream);

Copied: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (from r1244433, lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?p2=lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java&p1=lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java&r1=1244433&r2=1245098&rev=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Feb 16 17:47:06 2012
@@ -26,7 +26,6 @@ import java.util.EnumMap;
 import java.util.List;
 
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
 import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
@@ -35,7 +34,6 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -49,20 +47,27 @@ import org.apache.lucene.util.fst.FST;
 // TODO: somehow factor out a reusable viterbi search here,
 // so other decompounders/tokenizers can reuse...
 
-// nocommit add toDot and look at 1st pass intersection
-
-// nocomit explain how the 2nd best tokenization is
-// "contextual"...
+/* Uses a rolling Viterbi search to find the least cost
+ * segmentation (path) of the incoming characters.  For
+ * tokens that appear to be compound (> length 2 for all
+ * Kanji, or > length 7 for non-Kanji), we see if there is a
+ * 2nd best segmentation of that token after applying
+ * penalties to the long tokens.  If so, and the Mode is
+ * SEARCH_WITH_COMPOUNDS, we output the alternate
+ * segmentation as well. */
+public final class KuromojiTokenizer extends Tokenizer {
 
-// nocommit beast test random data...
+  public static enum Mode {
+    NORMAL, SEARCH, SEARCH_WITH_COMPOUNDS, EXTENDED
+  }
 
-// nocommit what default mode...?
+  public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
-/* Uses a rolling Viterbi search to find the least cost
- * segmentation (path) of the incoming characters.
- *
- * @lucene.experimental */
-public final class KuromojiTokenizer2 extends Tokenizer {
+  enum Type {
+    KNOWN,
+    UNKNOWN,
+    USER
+  }
 
   private static final boolean VERBOSE = false;
 
@@ -127,7 +132,7 @@ public final class KuromojiTokenizer2 ex
   private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
   private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
 
-  public KuromojiTokenizer2(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
+  public KuromojiTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
     super(input);
     dictionary = TokenInfoDictionary.getInstance();
     fst = dictionary.getFST();
@@ -175,6 +180,14 @@ public final class KuromojiTokenizer2 ex
     dictionaryMap.put(Type.USER, userDictionary);
   }
 
+  private GraphvizFormatter dotOut;
+
+  /** Expert: set this to produce graphviz (dot) output of
+   *  the Viterbi lattice */
+  public void setGraphvizFormatter(GraphvizFormatter dotOut) {
+    this.dotOut = dotOut;
+  }
+
   @Override
   public void reset(Reader input) throws IOException {
     super.reset(input);
@@ -240,7 +253,7 @@ public final class KuromojiTokenizer2 ex
   }
 
   // Holds all back pointers arriving to this position:
-  private final static class Position {
+  final static class Position {
 
     int pos;
 
@@ -417,7 +430,7 @@ public final class KuromojiTokenizer2 ex
   // TODO: make generic'd version of this "circular array"?
   // It's a bit tricky because we do things to the Position
   // (eg, set .pos = N on reuse)...
-  private static final class WrappedPositionArray {
+  static final class WrappedPositionArray {
     private Position[] positions = new Position[8];
 
     public WrappedPositionArray() {
@@ -672,6 +685,7 @@ public final class KuromojiTokenizer2 ex
       // In the case of normal mode, it doesn't process unknown word greedily.
 
       if (!searchMode && unknownWordEndIndex > posData.pos) {
+        pos++;
         continue;
       }
 
@@ -858,6 +872,10 @@ public final class KuromojiTokenizer2 ex
 
     final char[] fragment = buffer.get(lastBackTracePos, endPos-lastBackTracePos);
 
+    if (dotOut != null) {
+      dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
+    }
+
     int pos = endPos;
     int bestIDX = fromIDX;
     Token altToken = null;
@@ -1093,7 +1111,7 @@ public final class KuromojiTokenizer2 ex
     positions.freeBefore(endPos);
   }
 
-  private Dictionary getDict(Type type) {
+  Dictionary getDict(Type type) {
     return dictionaryMap.get(type);
   }
 

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Thu Feb 16 17:47:06 2012
@@ -17,8 +17,8 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
-import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 public class Token {
   private final Dictionary dictionary;

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestExtendedMode.java Thu Feb 16 17:47:06 2012
@@ -25,19 +25,17 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestExtendedMode extends BaseTokenStreamTestCase {
-  private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
   private final Analyzer analyzer = new Analyzer() {
     
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      //Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.EXTENDED);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java Thu Feb 16 17:47:06 2012
@@ -23,7 +23,7 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiBaseFormFilter.java Thu Feb 16 17:47:06 2012
@@ -28,8 +28,7 @@ public class TestKuromojiBaseFormFilter 
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      //Tokenizer tokenizer = new KuromojiTokenizer(reader);
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Segmenter.DEFAULT_MODE);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
       return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
     }
   };

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Thu Feb 16 17:47:06 2012
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -28,7 +29,8 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -58,7 +60,15 @@ public class TestKuromojiTokenizer exten
   private Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), false, Mode.SEARCH);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer analyzerNormal = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
@@ -66,7 +76,7 @@ public class TestKuromojiTokenizer exten
   private Analyzer analyzerNoPunct = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), true, Mode.SEARCH);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
@@ -74,7 +84,7 @@ public class TestKuromojiTokenizer exten
   private Analyzer analyzerWithCompounds = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), false, Mode.SEARCH_WITH_COMPOUNDS);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH_WITH_COMPOUNDS);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
@@ -82,11 +92,17 @@ public class TestKuromojiTokenizer exten
   private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, readDict(), true, Mode.EXTENDED);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-  
+
+  public void testNormalMode() throws Exception {
+    assertAnalyzesTo(analyzerNormal,
+                     "シニアソフトウェアエンジニア",
+                     new String[] {"シニアソフトウェアエンジニア"});
+  }
+
   public void testDecomposition1() throws Exception {
     assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
                          "アメリカ低所得者医療援助制度が、今日では、その予算の約３分の１を老人に費やしている。",
@@ -312,6 +328,29 @@ public class TestKuromojiTokenizer exten
                      surfaceForms);
   }
 
+  public void testLatticeToDot() throws Exception {
+    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+    final Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+        tokenizer.setGraphvizFormatter(gv2);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+    
+    assertTrue(gv2.finish().indexOf("22.0") != -1);
+  }
+
   private void assertReadings(String input, String... readings) throws IOException {
     TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
     ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Thu Feb 16 17:47:06 2012
@@ -30,7 +30,7 @@ import java.util.List;
 import java.util.zip.ZipFile;
 
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -80,14 +80,15 @@ public class TestQuality extends LuceneT
     }
     //System.out.println("maxLen=" + maxLen);
 
-    final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
-    //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+    final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
     tokenizer.reset();
     final String all = sb.toString();
+    System.out.println("all.len=" + all.length());
     final int ITERS = 20;
     CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); 
     for(int iter=0;iter<ITERS;iter++) {
       tokenizer.reset(new StringReader(all));
+      tokenizer.reset();
       count = 0;
       long t0 = System.currentTimeMillis();
       while(tokenizer.incrementToken()) {
@@ -122,9 +123,7 @@ public class TestQuality extends LuceneT
      word agreement?: 0.999587584716181
      */
     
-    //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
-    //final Tokenizer tokenizer = new KuromojiTokenizer(new Segmenter(Mode.NORMAL), new StringReader(""));
-    final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
+    final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""), null, true, Mode.SEARCH_WITH_COMPOUNDS);
     String line1 = null;
     String line2 = null;
     int count = 0;

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Thu Feb 16 17:47:06 2012
@@ -28,7 +28,7 @@ import java.util.Arrays;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSearchMode extends BaseTokenStreamTestCase {
@@ -37,7 +37,7 @@ public class TestSearchMode extends Base
   private final Analyzer analyzer = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.SEARCH);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
@@ -45,7 +45,7 @@ public class TestSearchMode extends Base
   private final Analyzer analyzerWithCompounds = new Analyzer() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer2(reader, null, true, Mode.SEARCH_WITH_COMPOUNDS);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH_WITH_COMPOUNDS);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Thu Feb 16 17:47:06 2012
@@ -23,29 +23,17 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.IOException;
 
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 
 public class UserDictionaryTest extends LuceneTestCase {
 
-  private UserDictionary readDict() throws IOException {
-    InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
-    if (is == null)
-      throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
-    try {
-      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
-      return new UserDictionary(reader);
-    } finally {
-      is.close();
-    }
-  }
-  
   @Test
   public void testLookup() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     String s = "関西国際空港に行った";
     int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testReadings() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testPartOfSpeech() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testRead() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     assertNotNull(dictionary);		
   }
 }

Modified: lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1245098&r1=1245097&r2=1245098&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene3767/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Thu Feb 16 17:47:06 2012
@@ -28,8 +28,7 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory ex
   
   @Override
   public Tokenizer create(Reader input) {
-    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+    return new KuromojiTokenizer(input, userDictionary, true, mode);
   }
   
   private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory ex
     if (mode != null) {
       return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
     } else {
-      return Segmenter.DEFAULT_MODE;
+      return KuromojiTokenizer.DEFAULT_MODE;
     }
   }
 }