Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/10 15:54:49 UTC

svn commit: r1299213 [2/2] - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/ lucene/contrib/analyzers/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ luce...

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Sat Mar 10 14:54:47 2012
@@ -17,7 +17,13 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
+import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -26,21 +32,76 @@ import org.apache.lucene.analysis.BaseTo
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 
 public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
+
+  public static UserDictionary readDict() {
+    InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
+    if (is == null) {
+      throw new RuntimeException("Cannot find userdict.txt in test classpath!");
+    }
+    try {
+      try {
+        Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
+        return new UserDictionary(reader);
+      } finally {
+        is.close();
+      }
+    } catch (IOException ioe) {
+      throw new RuntimeException(ioe);
+    }
+  }
+
   private Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-  
+
+  private Analyzer analyzerNormal = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer analyzerNoPunct = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  private Analyzer extendedModeAnalyzerNoPunct = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
+      return new TokenStreamComponents(tokenizer, tokenizer);
+    }
+  };
+
+  public void testNormalMode() throws Exception {
+    assertAnalyzesTo(analyzerNormal,
+                     "シニアソフトウェアエンジニア",
+                     new String[] {"シニアソフトウェアエンジニア"});
+  }
+
   public void testDecomposition1() throws Exception {
-    assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
+    assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
                          "アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
      new String[] { "本来", "は",  "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",      
                     "提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある",  "アメリカ", 
@@ -56,7 +117,7 @@ public class TestKuromojiTokenizer exten
   }
   
   public void testDecomposition2() throws Exception {
-    assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
+    assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
       new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
       new int[] { 0, 2, 3, 5, 6,  10, 13, 16, 17, 19 },
       new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
@@ -64,7 +125,7 @@ public class TestKuromojiTokenizer exten
   }
   
   public void testDecomposition3() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー",  "ホプキンス" },
       new int[] { 0, 2, 3, 5, 10 },
       new int[] { 2, 3, 5, 9, 15 }
@@ -92,9 +153,32 @@ public class TestKuromojiTokenizer exten
     ts.close();
   }
 
+  /*
+    // NOTE: intentionally fails!  Just trying to debug this
+    // one input...
+  public void testDecomposition6() throws Exception {
+    assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
+      new String[] { "これ", "は", "本", "で", "は", "ない" },
+      new int[] { 0, 2, 3, 4, 5, 6 },
+      new int[] { 2, 3, 4, 5, 6, 8 }
+                     );
+  }
+  */
+
   /** Tests that sentence offset is incorporated into the resulting offsets */
   public void testTwoSentences() throws Exception {
-    assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
+    /*
+    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
+    TokenStream ts = analyzer.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
+    ts.reset();
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+    while(ts.incrementToken()) {
+      System.out.println("  " + termAtt.toString());
+    }
+    System.out.println("DONE PARSE\n\n");
+    */
+
+    assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
       new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス",  "魔女", "狩", "大将", "マシュー",  "ホプキンス"  },
       new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
       new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
@@ -104,6 +188,7 @@ public class TestKuromojiTokenizer exten
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
   }
   
   public void testLargeDocReliability() throws Exception {
@@ -126,6 +211,9 @@ public class TestKuromojiTokenizer exten
   public void testSurrogates2() throws IOException {
     int numIterations = atLeast(10000);
     for (int i = 0; i < numIterations; i++) {
+      if (VERBOSE) {
+        System.out.println("\nTEST: iter=" + i);
+      }
       String s = _TestUtil.randomUnicodeString(random, 100);
       TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
@@ -135,22 +223,410 @@ public class TestKuromojiTokenizer exten
       }
     }
   }
+
+  public void testOnlyPunctuation() throws IOException {
+    TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  public void testOnlyPunctuationExtended() throws IOException {
+    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
+    ts.reset();
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
   
   // note: test is kinda silly since kuromoji emits punctuation tokens.
   // but, when/if we filter these out it will be useful.
   public void testEnd() throws Exception {
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
         new String[] { "これ", "は", "本", "で", "は", "ない" },
         new int[] { 0, 2, 3, 4, 5, 6 },
         new int[] { 2, 3, 4, 5, 6, 8 },
         new Integer(8)
     );
-    
-    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない    ")),
+
+    assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない    ")),
         new String[] { "これ", "は", "本", "で", "は", "ない"  },
         new int[] { 0, 2, 3, 4, 5, 6, 8 },
         new int[] { 2, 3, 4, 5, 6, 8, 9 },
         new Integer(12)
     );
   }
+
+  public void testUserDict() throws Exception {
+    // Not a great test because w/o userdict.txt the
+    // segmentation is the same:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
+                              new String[] { "関西", "国際", "空港", "に", "行っ", "た"  },
+                              new int[] { 0, 2, 4, 6, 7, 9 },
+                              new int[] { 2, 4, 6, 7, 9, 10 },
+                              new Integer(10)
+    );
+  }
+
+  public void testUserDict2() throws Exception {
+    // Better test: w/o userdict the segmentation is different:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
+                              new String[] { "朝青龍"  },
+                              new int[] { 0 },
+                              new int[] { 3 },
+                              new Integer(3)
+    );
+  }
+
+  public void testUserDict3() throws Exception {
+    // Test entry that breaks into multiple tokens:
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
+                              new String[] { "a", "b", "cd"  },
+                              new int[] { 0, 1, 2 },
+                              new int[] { 1, 2, 4 },
+                              new Integer(4)
+    );
+  }
+
+  // HMM: fails (segments as a/b/cd/efghij)... because the
+  // two paths have exactly equal costs (1 KNOWN + 1
+  // UNKNOWN) and we don't seem to favor longer KNOWN /
+  // shorter UNKNOWN matches:
+
+  /*
+  public void testUserDict4() throws Exception {
+    // Test entry that has another entry as prefix
+    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
+                              new String[] { "ab", "cd", "efg", "hij"  },
+                              new int[] { 0, 2, 4, 7 },
+                              new int[] { 2, 4, 7, 10 },
+                              new Integer(10)
+    );
+  }
+  */
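+
+  // (Editorial sketch, not part of the original commit: with the test
+  // userdict entries "abcd -> a b cd" and "abcdefg -> ab cd efg", the
+  // input "abcdefghij" admits two lattice paths of equal total cost --
+  // user entry "abcd" plus UNKNOWN "efghij", or user entry "abcdefg"
+  // plus UNKNOWN "hij" -- each one KNOWN run followed by one UNKNOWN
+  // run, so the Viterbi search has no basis to prefer the longer KNOWN
+  // match and may return either segmentation.)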
+  
+  public void testSegmentation() throws Exception {
+    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+    //		String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+    //		String[] surfaceForms = {
+    //				"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+    //				"スペース", "ステーション", "に", "行き", "ます", "。",
+    //				"うたがわしい", "。"
+    //		};
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+  }
+
+  public void testLatticeToDot() throws Exception {
+    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
+    final Analyzer analyzer = new ReusableAnalyzerBase() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
+        tokenizer.setGraphvizFormatter(gv2);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    assertAnalyzesTo(analyzer,
+                     input,
+                     surfaceForms);
+    
+    assertTrue(gv2.finish().indexOf("22.0") != -1);
+  }
+
+  private void assertReadings(String input, String... readings) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String reading : readings) {
+      assertTrue(ts.incrementToken());
+      assertEquals(reading, readingAtt.getReading());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertPronunciations(String input, String... pronunciations) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
+    ts.reset();
+    for(String pronunciation : pronunciations) {
+      assertTrue(ts.incrementToken());
+      assertEquals(pronunciation, readingAtt.getPronunciation());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  private void assertBaseForms(String input, String... baseForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
+    ts.reset();
+    for(String baseForm : baseForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(baseForm, baseFormAtt.getBaseForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionType : inflectionTypes) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionType, inflectionAtt.getInflectionType());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+
+  private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
+    ts.reset();
+    for(String inflectionForm : inflectionForms) {
+      assertTrue(ts.incrementToken());
+      assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
+    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
+    PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
+    ts.reset();
+    for(String partOfSpeech : partsOfSpeech) {
+      assertTrue(ts.incrementToken());
+      assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
+    }
+    assertFalse(ts.incrementToken());
+    ts.end();
+  }
+  
+  public void testReadings() throws Exception {
+    assertReadings("寿司が食べたいです。",
+                   "スシ",
+                   "ガ",
+                   "タベ",
+                   "タイ",
+                   "デス",
+                   "。");
+  }
+  
+  public void testReadings2() throws Exception {
+    assertReadings("多くの学生が試験に落ちた。",
+                   "オオク",
+                   "ノ",
+                   "ガクセイ",
+                   "ガ",
+                   "シケン",
+                   "ニ",
+                   "オチ",
+                   "タ",
+                   "。");
+  }
+  
+  public void testPronunciations() throws Exception {
+    assertPronunciations("寿司が食べたいです。",
+                         "スシ",
+                         "ガ",
+                         "タベ",
+                         "タイ",
+                         "デス",
+                         "。");
+  }
+  
+  public void testPronunciations2() throws Exception {
+    // pronunciation differs from reading here
+    assertPronunciations("多くの学生が試験に落ちた。",
+                         "オーク",
+                         "ノ",
+                         "ガクセイ",
+                         "ガ",
+                         "シケン",
+                         "ニ",
+                         "オチ",
+                         "タ",
+                         "。");
+  }
+  
+  public void testBasicForms() throws Exception {
+    assertBaseForms("それはまだ実験段階にあります。",
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    "ある",
+                    null,
+                    null);
+  }
+  
+  public void testInflectionTypes() throws Exception {
+    assertInflectionTypes("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "五段・ラ行",
+                          "特殊・マス",
+                          null);
+  }
+  
+  public void testInflectionForms() throws Exception {
+    assertInflectionForms("それはまだ実験段階にあります。",
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          null,
+                          "連用形",
+                          "基本形",
+                          null);
+  }
+  
+  public void testPartOfSpeech() throws Exception {
+    assertPartsOfSpeech("それはまだ実験段階にあります。",
+                        "名詞-代名詞-一般",
+                        "助詞-係助詞",
+                        "副詞-助詞類接続",
+                        "名詞-サ変接続",
+                        "名詞-一般",
+                        "助詞-格助詞-一般",
+                        "動詞-自立",
+                        "助動詞",
+                        "記号-句点");
+  }
+
+  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+  // do we have a possibility to actually lookup the first and last word from dictionary?
+  public void testYabottai() throws Exception {
+    assertAnalyzesTo(analyzer, "やぼったい",
+                     new String[] {"やぼったい"});
+  }
+
+  public void testTsukitosha() throws Exception {
+    assertAnalyzesTo(analyzer, "突き通しゃ",
+                     new String[] {"突き通しゃ"});
+  }
+
+  public void testBocchan() throws Exception {
+    doTestBocchan(1);
+  }
+
+  @Nightly
+  public void testBocchanBig() throws Exception {
+    doTestBocchan(100);
+  }
+
+  /*
+  public void testWikipedia() throws Exception {
+    final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
+    final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+
+    final long startTimeNS = System.nanoTime();
+    boolean done = false;
+    long compoundCount = 0;
+    long nonCompoundCount = 0;
+    long netOffset = 0;
+    while (!done) {
+      final TokenStream ts = analyzer.tokenStream("ignored", r);
+      ts.reset();
+      final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
+      final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+      int count = 0;
+      while (true) {
+        if (!ts.incrementToken()) {
+          done = true;
+          break;
+        }
+        count++;
+        if (posIncAtt.getPositionIncrement() == 0) {
+          compoundCount++;
+        } else {
+          nonCompoundCount++;
+          if (nonCompoundCount % 1000000 == 0) {
+            System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
+                                             (System.nanoTime()-startTimeNS)/1000000.0,
+                                             netOffset + offsetAtt.startOffset(),
+                                             nonCompoundCount,
+                                             compoundCount));
+          }
+        }
+        if (count == 100000000) {
+          System.out.println("  again...");
+          break;
+        }
+      }
+      ts.end();
+      netOffset += offsetAtt.endOffset();
+    }
+    System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
+    r.close();
+  }
+  */
+
+  
+  private void doTestBocchan(int numIterations) throws Exception {
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+        this.getClass().getResourceAsStream("bocchan.utf-8")));
+    String line = reader.readLine();
+    reader.close();
+    
+    if (VERBOSE) {
+      System.out.println("Test for Bocchan without pre-splitting sentences");
+    }
+
+    /*
+    if (numIterations > 1) {
+      // warmup
+      for (int i = 0; i < numIterations; i++) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    */
+
+    long totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
+      ts.reset();
+      while(ts.incrementToken());
+    }
+    String[] sentences = line.split("、|。");
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+      System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
+    }
+    totalStart = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      for (String sentence: sentences) {
+        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
+        ts.reset();
+        while(ts.incrementToken());
+      }
+    }
+    if (VERBOSE) {
+      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
+    }
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestSearchMode.java Sat Mar 10 14:54:47 2012
@@ -28,20 +28,19 @@ import org.apache.lucene.analysis.Analyz
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.util.IOUtils;
 
 public class TestSearchMode extends BaseTokenStreamTestCase {
   private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
-  private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
   private final Analyzer analyzer = new ReusableAnalyzerBase() {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
+      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
       return new TokenStreamComponents(tokenizer, tokenizer);
     }
   };
-  
+
   /** Test search mode segmentation */
   public void testSearchSegmentation() throws IOException {
     InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
@@ -64,7 +63,18 @@ public class TestSearchMode extends Base
         String[] fields = line.split("\t", 2);
         String sourceText = fields[0];
         String[] expectedTokens = fields[1].split("\\s+");
-        assertAnalyzesTo(analyzer, sourceText, expectedTokens);
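+        // (Editorial note, not in the original commit: in
+        // search-segmentation-tests.txt a trailing "/0" marks a
+        // compound token stacked at the same position as the previous
+        // token, e.g.:
+        //   関西国際空港 <TAB> 関西 関西国際空港/0 国際 空港
+        // Its expected position increment is therefore left at 0 (the
+        // Java int[] default) below, while ordinary tokens get 1.)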
+        int[] expectedPosIncrs = new int[expectedTokens.length];
+        int[] expectedPosLengths = new int[expectedTokens.length];
+        for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {
+          if (expectedTokens[tokIDX].endsWith("/0")) {
+            expectedTokens[tokIDX] = expectedTokens[tokIDX].replace("/0", "");
+            expectedPosLengths[tokIDX] = expectedTokens.length-1;
+          } else {
+            expectedPosIncrs[tokIDX] = 1;
+            expectedPosLengths[tokIDX] = 1;
+          }
+        }
+        assertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
       }
     } finally {
       is.close();

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Sat Mar 10 14:54:47 2012
@@ -23,29 +23,17 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.IOException;
 
-import org.apache.lucene.analysis.kuromoji.SegmenterTest;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 
 public class UserDictionaryTest extends LuceneTestCase {
 
-  private UserDictionary readDict() throws IOException {
-    InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
-    if (is == null)
-      throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
-    try {
-      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
-      return new UserDictionary(reader);
-    } finally {
-      is.close();
-    }
-  }
-  
   @Test
   public void testLookup() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     String s = "関西国際空港に行った";
     int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be three 関西, 国際, 空港
@@ -69,7 +57,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testReadings() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@@ -83,7 +71,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testPartOfSpeech() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@@ -92,7 +80,7 @@ public class UserDictionaryTest extends 
   
   @Test
   public void testRead() throws IOException {
-    UserDictionary dictionary = readDict();
+    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
     assertNotNull(dictionary);		
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/search-segmentation-tests.txt Sat Mar 10 14:54:47 2012
@@ -25,43 +25,45 @@
 ##
 
 # Kansai International Airport
-関西国際空港	関西 国際 空港
+関西国際空港	関西 関西国際空港/0 国際 空港
 # Narita Airport
-成田空港	成田 空港
+成田空港	成田 成田空港/0 空港
 # Haneda Airport
-羽田空港	羽田 空港
+羽田空港	羽田 羽田空港/0 空港
 # Nara Institute of Science and Technology
-奈良先端科学技術大学院大学	奈良 先端 科学 技術 大学院 大学
+奈良先端科学技術大学院大学	奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学
 # Tokyo University
-東京大学	東京 大学
+東京大学	東京 東京大学/0 大学
 # Kyoto University
-京都大学	京都 大学
+京都大学	京都 京都大学/0 大学
+
+# NOTE: differs from non-compound mode:
 # Kyoto University Baseball Club
-京都大学硬式野球部	京都 大学 硬式 野球 部
+京都大学硬式野球部	京都大 学 硬式 野球 部
 
 ##
 ## Katakana titles
 ##
 
 # Senior Software Engineer
-シニアソフトウェアエンジニア	シニア ソフトウェア エンジニア
+シニアソフトウェアエンジニア	シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア
 # Software Engineer
 ソフトウェアエンジニア	ソフトウェア エンジニア
 # Senior Project Manager
-シニアプロジェクトマネジャー	シニア プロジェクト マネジャー
+シニアプロジェクトマネジャー	シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー
 # Project Manager
 プロジェクトマネジャー	プロジェクト マネジャー
 # Senior Sales Engineer
-シニアセールスエンジニア	シニア セールス エンジニア
+シニアセールスエンジニア	シニア シニアセールスエンジニア/0 セールス エンジニア
 # System Architect
-システムアーキテクト	システム アーキテクト
+システムアーキテクト	システム システムアーキテクト/0 アーキテクト
 # Senior System Architect
-シニアシステムアーキテクト	シニア システム アーキテクト
+シニアシステムアーキテクト	シニア シニアシステムアーキテクト/0 システム アーキテクト
 # System Administrator
 システムアドミニストレータ	システム アドミニストレータ
-システムアドミニストレーター	システム アドミニストレーター
+システムアドミニストレーター	システム システムアドミニストレーター/0 アドミニストレーター
 # Senior System Administrator
-シニアシステムアドミニストレーター	シニア システム アドミニストレーター
+シニアシステムアドミニストレーター	シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター
 
 ##
 ## Company names (several are fictitious)
@@ -70,25 +72,25 @@
 # SoftBank Mobile
 ソフトバンクモバイル	ソフトバンク モバイル
 # Alpine Materials
-アルパインマテリアルズ	アルパイン マテリアルズ
+アルパインマテリアルズ	アルパイン アルパインマテリアルズ/0 マテリアルズ
 # Sapporo Holdings
 サッポロホールディングス	サッポロ ホールディングス
 # Yamada Corporation
-ヤマダコーポレーション	ヤマダ コーポレーション
+ヤマダコーポレーション	ヤマダ ヤマダコーポレーション/0 コーポレーション
 # Canon Semiconductor equipment	NOTE: Semiconductor becomes semi + conductor
-キヤノンセミコンダクターエクィップメント	キヤノン セミ コンダクター エクィップメント
+キヤノンセミコンダクターエクィップメント	キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント
 # Oriental Chain
-オリエンタルチエン	オリエンタル チエン
+オリエンタルチエン	オリエンタル オリエンタルチエン/0 チエン
 # Ally Projects Japan	NOTE: Becomes one token as プロジェクツ is not in IPADIC
 アーリープロジェクツジャパン	アーリープロジェクツジャパン
 # Peter Pan Corporation
-ピーターパンコーポレーション	ピーター パン コーポレーション
+ピーターパンコーポレーション	ピーター ピーターパンコーポレーション/0 パン コーポレーション
 # AIM Create
 エイムクリエイツ	エイムクリエイツ
 # Mars Engineering
-マースエンジニアリング	マース エンジニアリング
+マースエンジニアリング	マース マースエンジニアリング/0 エンジニアリング
 # Fuji Protein Technology
-フジプロテインテクノロジー	フジ プロテイン テクノロジー
+フジプロテインテクノロジー	フジ フジプロテインテクノロジー/0 プロテイン テクノロジー
 
 ##
 ## Person names
@@ -100,7 +102,7 @@
 スティーブジョブズ	スティーブ ジョブズ
 # Harry Potter	NOTE: Becomes one token (short word)
 ハリーポッター	ハリーポッター
-# Bill Gates	NOTE: Becomes one token (short work)
+# Bill Gates	NOTE: Becomes one token (short word)
 ビルゲイツ	ビルゲイツ
 # Sean Connery	NOTE: Becomes one token (okay)
 ショーンコネリー	ショーンコネリー
@@ -133,8 +135,8 @@
 ##
 
 # JT Engineering	NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
-ジェイティエンジニアリング	ジェイ ティエン ジニア リング
+ジェイティエンジニアリング	ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング
 # Anchovy pasta	NOTE: Becomes Anch yvipasta
-アンチョビパスタ	アンチ ョビパスタ
+アンチョビパスタ	アンチ アンチョビパスタ/0 ョビパスタ
 # Surprise gift	NOTE: Becomes one token (surprise not in IPADIC)
 サプライズギフト	サプライズギフト

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/userdict.txt Sat Mar 10 14:54:47 2012
@@ -4,3 +4,7 @@
 
 # Custom reading for sumo wrestler
 朝青龍,朝青龍,アサショウリュウ,カスタム人名
+
+# Silly entry:
+abcd,a b cd,foo1 foo2 foo3,bar
+abcdefg,ab cd efg,foo1 foo2 foo4,bar
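+
+# (Editorial note, not part of the original commit: each entry is
+# surface,segmentation,readings,part-of-speech, so the "abcd" line
+# above makes the single surface form "abcd" segment into the three
+# tokens a / b / cd, as exercised by TestKuromojiTokenizer.testUserDict3.)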

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java Sat Mar 10 14:54:47 2012
@@ -54,9 +54,10 @@ public class PositionIncrementAttributeI
    * @param positionIncrement the distance from the prior term
    */
   public void setPositionIncrement(int positionIncrement) {
-    if (positionIncrement < 0)
+    if (positionIncrement < 0) {
       throw new IllegalArgumentException
-        ("Increment must be zero or greater: " + positionIncrement);
+        ("Increment must be zero or greater: got " + positionIncrement);
+    }
     this.positionIncrement = positionIncrement;
   }
 
@@ -79,7 +80,8 @@ public class PositionIncrementAttributeI
     }
     
     if (other instanceof PositionIncrementAttributeImpl) {
-      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+      PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+      return positionIncrement ==  _other.positionIncrement;
     }
  
     return false;
@@ -95,5 +97,4 @@ public class PositionIncrementAttributeI
     PositionIncrementAttribute t = (PositionIncrementAttribute) target;
     t.setPositionIncrement(positionIncrement);
   }  
-
 }

Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sat Mar 10 14:54:47 2012
@@ -840,6 +840,7 @@ public final class FST<T> {
   }
 
   public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
     final int address = getNodeAddress(node);
     in.pos = address;
     //System.out.println("  readFirstRealTargtArc address="
@@ -936,6 +937,7 @@ public final class FST<T> {
   /** Never returns null, but you should never call this if
    *  arc.isLast() is true. */
   public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
 
     // TODO: can't assert this because we call from readFirstArc
     // assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@ public final class FST<T> {
    *  This returns null if the arc was not found, else the incoming arc. */
   public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
     assert cachedRootArcs != null;
+    assert in.bytes == bytes;
 
     if (labelToMatch == END_LABEL) {
       if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@ public final class FST<T> {
 
   /** Expert */
   public static abstract class BytesReader extends DataInput {
-    int pos;
+    protected int pos;
+    protected final byte[] bytes;
+    protected BytesReader(byte[] bytes, int pos) {
+      this.bytes = bytes;
+      this.pos = pos;
+    }
     abstract void skip(int byteCount);
     abstract void skip(int base, int byteCount);
   }
 
   final static class ReverseBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ReverseBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override
@@ -1262,11 +1268,9 @@ public final class FST<T> {
   // TODO: can we use just ByteArrayDataInput...?  need to
   // add a .skipBytes to DataInput.. hmm and .setPosition
   final static class ForwardBytesReader extends BytesReader {
-    final byte[] bytes;
 
     public ForwardBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
     }
 
     @Override

Modified: lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Sat Mar 10 14:54:47 2012
@@ -17,13 +17,18 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
 import java.io.Reader;
 import java.io.StringReader;
-import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
- 
+
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@ public abstract class BaseTokenStreamTes
     }
   }
 
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
     
@@ -107,6 +112,12 @@ public abstract class BaseTokenStreamTes
       assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
       posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
     }
+
+    PositionLengthAttribute posLengthAtt = null;
+    if (posLengths != null) {
+      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
+      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+    }
     
     ts.reset();
     for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@ public abstract class BaseTokenStreamTes
       if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
       
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@ public abstract class BaseTokenStreamTes
         assertEquals("type "+i, types[i], typeAtt.type());
       if (posIncrements != null)
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
+      if (posLengths != null)
+        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
       
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -138,14 +152,18 @@ public abstract class BaseTokenStreamTes
         assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
         if (finalOffset != null) {
           assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+                     offsetAtt.endOffset() <= finalOffset.intValue());
         }
       }
       if (posIncrAtt != null) {
         assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
       }
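+      // (Editorial note, not in the original commit: position length
+      // is the number of positions a token spans. A compound token
+      // such as シニアソフトウェアエンジニア stacked over シニア +
+      // ソフトウェア + エンジニア spans 3 positions; no token can span
+      // fewer than 1.)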
+      if (posLengthAtt != null) {
+        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+      }
     }
-    assertFalse("end of stream", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
     ts.end();
     if (finalOffset != null)
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@ public abstract class BaseTokenStreamTes
     ts.close();
   }
   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, null, null);
+    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, types, null, null);
+    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
   }
   
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
   }
 
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
+  }
+  
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
+  }
+  
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, null);
+    assertAnalyzesTo(a, input, output, null, null, null, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, types, null);
+    assertAnalyzesTo(a, input, output, null, null, types, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
+  }
+
+  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
   }
   
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
   }
   
 
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
   
   public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
@@ -326,7 +360,7 @@ public abstract class BaseTokenStreamTes
       }
 
       if (VERBOSE) {
-        System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
       }
 
       int remainder = random.nextInt(10);
@@ -336,10 +370,12 @@ public abstract class BaseTokenStreamTes
       CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
       PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+      PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
       TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
       List<String> tokens = new ArrayList<String>();
       List<String> types = new ArrayList<String>();
       List<Integer> positions = new ArrayList<Integer>();
+      List<Integer> positionLengths = new ArrayList<Integer>();
       List<Integer> startOffsets = new ArrayList<Integer>();
       List<Integer> endOffsets = new ArrayList<Integer>();
       ts.reset();
@@ -347,6 +383,7 @@ public abstract class BaseTokenStreamTes
         tokens.add(termAtt.toString());
         if (typeAtt != null) types.add(typeAtt.type());
         if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+        if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
         if (offsetAtt != null) {
           startOffsets.add(offsetAtt.startOffset());
           endOffsets.add(offsetAtt.endOffset());
@@ -357,11 +394,21 @@ public abstract class BaseTokenStreamTes
       // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
       if (!tokens.isEmpty()) {
         if (VERBOSE) {
-          System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
         }
         reader = new StringReader(text);
         ts = a.reusableTokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-        if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+        if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+          // offset + pos + posLength + type
+          assertTokenStreamContents(ts, 
+            tokens.toArray(new String[tokens.size()]),
+            toIntArray(startOffsets),
+            toIntArray(endOffsets),
+            types.toArray(new String[types.size()]),
+            toIntArray(positions),
+            toIntArray(positionLengths),
+            text.length());
+        } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
           // offset + pos + type
           assertTokenStreamContents(ts, 
             tokens.toArray(new String[tokens.size()]),
@@ -369,7 +416,18 @@ public abstract class BaseTokenStreamTes
             toIntArray(endOffsets),
             types.toArray(new String[types.size()]),
             toIntArray(positions),
+            null,
             text.length());
+        } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+          // offset + pos + posLength
+          assertTokenStreamContents(ts, 
+              tokens.toArray(new String[tokens.size()]),
+              toIntArray(startOffsets),
+              toIntArray(endOffsets),
+              null,
+              toIntArray(positions),
+              toIntArray(positionLengths),
+              text.length());
         } else if (posIncAtt != null && offsetAtt != null) {
           // offset + pos
           assertTokenStreamContents(ts, 
@@ -378,6 +436,7 @@ public abstract class BaseTokenStreamTes
               toIntArray(endOffsets),
               null,
               toIntArray(positions),
+              null,
               text.length());
         } else if (offsetAtt != null) {
           // offset
@@ -387,6 +446,7 @@ public abstract class BaseTokenStreamTes
               toIntArray(endOffsets),
               null,
               null,
+              null,
               text.length());
         } else {
           // terms only
@@ -396,6 +456,22 @@ public abstract class BaseTokenStreamTes
       }
     }
   }
+
+  protected String toDot(Analyzer a, String inputText) throws IOException {
+    final StringWriter sw = new StringWriter();
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
+    return sw.toString();
+  }
+
+  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
+    Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
+    w.close();
+  }
   
   static int[] toIntArray(List<Integer> list) {
     int ret[] = new int[list.size()];

Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/KuromojiTokenizerFactory.java Sat Mar 10 14:54:47 2012
@@ -28,8 +28,7 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
-import org.apache.lucene.analysis.kuromoji.Segmenter;
-import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.util.IOUtils;
 import org.apache.solr.analysis.BaseTokenizerFactory;
@@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory ex
   
   //@Override
   public Tokenizer create(Reader input) {
-    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
+    return new KuromojiTokenizer(input, userDictionary, true, mode);
   }
   
   private Mode getMode(Map<String, String> args) {
@@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory ex
     if (mode != null) {
       return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
     } else {
-      return Segmenter.DEFAULT_MODE;
+      return KuromojiTokenizer.DEFAULT_MODE;
     }
   }
 }

Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java?rev=1299213&r1=1299212&r2=1299213&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestKuromojiTokenizerFactory.java Sat Mar 10 14:54:47 2012
@@ -50,7 +50,7 @@ public class TestKuromojiTokenizerFactor
     factory.inform(new SolrResourceLoader(null, null));
     TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
     assertTokenStreamContents(ts,
-        new String[] { "シニア", "ソフトウェア", "エンジニア" }
+                              new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
     );
   }