You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/06 19:37:10 UTC

svn commit: r1228334 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucene/anal...

Author: rmuir
Date: Fri Jan  6 18:37:10 2012
New Revision: 1228334

URL: http://svn.apache.org/viewvc?rev=1228334&view=rev
Log:
LUCENE-3305: speed up tokenization by not creating strings

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Fri Jan  6 18:37:10 2012
@@ -49,7 +49,7 @@ public class DebugTokenizer {
   }
   
   public String debugTokenize(String text) {
-    ViterbiNode[][][] lattice = this.viterbi.build(text);
+    ViterbiNode[][][] lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
     List<ViterbiNode> bestPath = this.viterbi.search(lattice);
     return this.formatter.format(lattice[0], lattice[1], bestPath);
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Fri Jan  6 18:37:10 2012
@@ -48,8 +48,8 @@ public final class KuromojiTokenizer ext
   @Override
   protected void setNextSentence(int sentenceStart, int sentenceEnd) {
     this.sentenceStart = sentenceStart;
-    // TODO: allow the tokenizer, at least maybe doTokenize to take char[] or charsequence or characteriterator?
-    tokens = tokenizer.tokenize(new String(buffer, sentenceStart, sentenceEnd-sentenceStart));
+    // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
+    tokens = tokenizer.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart);
     tokenIndex = 0;
   }
 
@@ -59,10 +59,8 @@ public final class KuromojiTokenizer ext
       return false;
     }
     Token token = tokens.get(tokenIndex);
-    // TODO: we don't really need the surface form except for its length? (its in the buffer already)
-    String surfaceForm = token.getSurfaceForm();
     int position = token.getPosition();
-    int length = surfaceForm.length();
+    int length = token.getLength();
     clearAttributes();
     termAtt.copyBuffer(buffer, sentenceStart + position, length);
     int startOffset = offset + sentenceStart + position;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Fri Jan  6 18:37:10 2012
@@ -20,22 +20,24 @@ package org.apache.lucene.analysis.kurom
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
-// TODO: somehow this thing needs to keep state, so that once it decodes metadata
-// it never does it again.
 public class Token {
   private final Dictionary dictionary;
   
   private final int wordId;
   
-  private final String surfaceForm;
+  private final char[] surfaceForm;
+  private final int offset;
+  private final int length;
   
   private final int position;
   
   private final Type type;
   
-  public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+  public Token(int wordId, char[] surfaceForm, int offset, int length, Type type, int position, Dictionary dictionary) {
     this.wordId = wordId;
     this.surfaceForm = surfaceForm;
+    this.offset = offset;
+    this.length = length;
     this.type = type;
     this.position = position;
     this.dictionary = dictionary;
@@ -44,11 +46,32 @@ public class Token {
   /**
    * @return surfaceForm
    */
-  public String getSurfaceForm() {
+  public char[] getSurfaceForm() {
     return surfaceForm;
   }
   
   /**
+   * @return offset into surfaceForm
+   */
+  public int getOffset() {
+    return offset;
+  }
+  
+  /**
+   * @return length of surfaceForm
+   */
+  public int getLength() {
+    return length;
+  }
+  
+  /**
+   * @return surfaceForm as a String
+   */
+  public String getSurfaceFormString() {
+    return new String(surfaceForm, offset, length);
+  }
+  
+  /**
    * @return reading. null if token doesn't have reading.
    */
   public String getReading() {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Fri Jan  6 18:37:10 2012
@@ -129,23 +129,28 @@ public class Tokenizer {
     return splitPositions;
   }
   
+  private List<Token> doTokenize(int offset, String sentence) {
+    char text[] = sentence.toCharArray();
+    return doTokenize(offset, text, 0, text.length);
+  }
+  
   /**
    * Tokenize input sentence.
    * @param offset offset of sentence in original input text
    * @param sentence sentence to tokenize
    * @return list of Token
    */
-  private List<Token> doTokenize(int offset, String sentence) {
+  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength) {
     ArrayList<Token> result = new ArrayList<Token>();
     
-    ViterbiNode[][][] lattice = viterbi.build(sentence);
+    ViterbiNode[][][] lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
     List<ViterbiNode> bestPath = viterbi.search(lattice);
     for (ViterbiNode node : bestPath) {
       int wordId = node.getWordId();
       if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS 
         continue;
       }
-      Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
+      Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
       result.add(token);
     }
     
@@ -167,7 +172,9 @@ public class Tokenizer {
     
     private Mode mode = Mode.NORMAL;
     
-    private boolean split = false;
+    // this is true, for other use.
+    // lucene's tokenizer uses a breakiterator and doTokenize directly.
+    private boolean split = true;
     
     private UserDictionary userDictionary = null;
     

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Fri Jan  6 18:37:10 2012
@@ -27,16 +27,16 @@ public final class UnknownDictionary ext
     super();
   }
   
-  public int lookup(String text) {
-    if(!characterDefinition.isGroup(text.charAt(0))) {
+  public int lookup(char[] text, int offset, int len) {
+    if(!characterDefinition.isGroup(text[offset])) {
       return 1;
     }
     
     // Extract unknown word. Characters with the same character class are considered to be part of unknown word
-    byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text.charAt(0));
+    byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text[offset]);
     int length = 1;
-    for (int i = 1; i < text.length(); i++) {
-      if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text.charAt(i))){
+    for (int i = 1; i < len; i++) {
+      if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text[offset+i])){
         length++;    			
       } else {
         break;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Fri Jan  6 18:37:10 2012
@@ -49,10 +49,14 @@ public class UserDictionary implements D
   
   /**
    * Lookup words in text
-   * @param text
+   * @param chars text
+   * @param off offset into text
+   * @param len length of text
    * @return array of {wordId, position, length}
    */
-  public int[][] lookup(String text) {
+  public int[][] lookup(char[] chars, int off, int len) {
+    // TODO: this method should be more efficient.
+    String text = new String(chars, off, len);
     TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
     
     for (String keyword : entries.descendingKeySet()) {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Fri Jan  6 18:37:10 2012
@@ -180,14 +180,14 @@ public final class DoubleArrayTrie {
    * @param key key to match
    * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
    */
-  public int lookup(String key) {
+  public int lookup(char key[], int offset, int length) {
     int index = 0;
     int base = 1; // base at index 0 should be 1
     
-    int keyLength = key.length();
-    for(int i = 0; i < keyLength; i++) {
+    int end = offset + length;
+    for(int i = offset; i < end; i++) {
       int previous = index;
-      index = index + base + key.charAt(i);
+      index = index + base + key[i];
       
       if(index > baseBuffer.limit()) { // Too long
         return -1;
@@ -204,7 +204,8 @@ public final class DoubleArrayTrie {
       }
       
       if(base >= TAIL_OFFSET) {	// If base is bigger than TAIL_OFFSET, start processing "tail"
-        return matchTail(base, index, key.substring(i + 1));
+        int newOffset = i + 1;
+        return matchTail(base, index, key, newOffset, end - newOffset);
       }
       
     }
@@ -222,16 +223,15 @@ public final class DoubleArrayTrie {
    * @param key
    * @return	index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
    */
-  private int matchTail(int base, int index, String key) {
+  private int matchTail(int base, int index, char key[], int offset, int length) {
     int positionInTailArr = base - TAIL_OFFSET;
     
-    int keyLength = key.length();
-    for(int i = 0; i < keyLength; i++) {
-      if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
+    for(int i = 0; i < length; i++) {
+      if(key[offset + i] != tailBuffer.get(positionInTailArr + i)){
         return -1;
       }
     }
-    return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
+    return tailBuffer.get(positionInTailArr + length) == TERMINATING_CHARACTER ? index : 0;
     
   }
   

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Fri Jan  6 18:37:10 2012
@@ -216,7 +216,7 @@ public class GraphvizFormatter {
         return BOS_LABEL;
       }
     } else {
-      return node.getSurfaceForm();
+      return node.getSurfaceFormString();
     }
   }
   

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Fri Jan  6 18:37:10 2012
@@ -57,9 +57,9 @@ public class Viterbi {
   
   private static final int SEARCH_MODE_PENALTY = 10000;
   
-  private static final String BOS = "BOS";
+  private static final char[] BOS = "BOS".toCharArray();
   
-  private static final String EOS = "EOS";
+  private static final char[] EOS = "EOS".toCharArray();
   
   /**
    * Constructor
@@ -131,13 +131,14 @@ public class Viterbi {
           // "Search mode". Add extra costs if it is long node.
           if (searchMode) {
             //						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
-            String surfaceForm = node.getSurfaceForm();
-            int length = surfaceForm.length();
+            char[] surfaceForm = node.getSurfaceForm();
+            int offset = node.getOffset();
+            int length = node.getLength();
             if (length > SEARCH_MODE_LENGTH_KANJI) {
               boolean allKanji = true;
               // check if node consists of only kanji
               for (int pos = 0; pos < length; pos++) {
-                if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
+                if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
                   allKanji = false;
                   break;
                 }				
@@ -176,9 +177,11 @@ public class Viterbi {
         int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
         int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
         int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
-        String surfaceForm = leftNode.getSurfaceForm();
-        for (int i = surfaceForm.length(); i > 0; i--) {
-          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
+        char[] surfaceForm = leftNode.getSurfaceForm();
+        int offset = leftNode.getOffset();
+        int length = leftNode.getLength();
+        for (int i = length; i > 0; i--) {
+          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i - 1, 1, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
           result.addFirst(uniGramNode);
         }
       } else {
@@ -195,41 +198,40 @@ public class Viterbi {
    * Build lattice from input text
    * @param text
    */
-  public ViterbiNode[][][] build(String text) {
-    int textLength = text.length();
-    ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][];  // text length + BOS and EOS
-    ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][];  // text length + BOS and EOS
-    int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
-    int[] endSizeArr = new int[textLength + 2];   // array to keep ViterbiNode count in endIndexArr
+  public ViterbiNode[][][] build(char text[], int offset, int length) {
+    ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
+    ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
+    int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
+    int[] endSizeArr = new int[length + 2];   // array to keep ViterbiNode count in endIndexArr
     
-    ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
+    ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
     addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
     
     // Process user dictionary;
     if (useUserDictionary) {
-      processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+      processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
     }
     
     int unknownWordEndIndex = -1;	// index of the last character of unknown word
     
-    for (int startIndex = 0; startIndex < textLength; startIndex++) {
+    for (int startIndex = 0; startIndex < length; startIndex++) {
       // If no token ends where current token starts, skip this index
       if (endSizeArr[startIndex + 1] == 0) {
         continue;
       }
       
-      String suffix = text.substring(startIndex);
+      int suffixStart = offset + startIndex;
+      int suffixLength = length - startIndex;
       
       boolean found = false;
-      for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
-        String prefix = suffix.substring(0, endIndex);
+      for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
         
-        int result = trie.lookup(prefix);
+        int result = trie.lookup(text, suffixStart, endIndex);
         
         if (result > 0) {	// Found match in double array trie
           found = true;	// Don't produce unknown word starting from this index
           for (int wordId : dictionary.lookupWordIds(result)) {
-            ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+            ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
             addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
           }
         } else if(result < 0) {	// If result is less than zero, continue to next position
@@ -242,31 +244,30 @@ public class Viterbi {
         continue;
       }
       
-      // Process Unknown Word
+      // Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
       int unknownWordLength = 0;
-      char firstCharacter = suffix.charAt(0);
+      char firstCharacter = text[suffixStart];
       boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
       if (isInvoke){	// Process "invoke"
-        unknownWordLength = unkDictionary.lookup(suffix);
+        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
       } else if (found == false){	// Process not "invoke"
-        unknownWordLength = unkDictionary.lookup(suffix);				
+        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);				
       }
       
       if (unknownWordLength > 0) {      // found unknown word
-        String unkWord = suffix.substring(0, unknownWordLength);
         int characterId = characterDefinition.getCharacterClass(firstCharacter);
         int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
         
         for (int wordId : wordIds) {
-          ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+          ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
           addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
         }
         unknownWordEndIndex = startIndex + unknownWordLength;
       }
     }
     
-    ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
-    addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
+    ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+    addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
     
     ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
     
@@ -281,13 +282,13 @@ public class Viterbi {
    * @param startSizeArr
    * @param endSizeArr
    */
-  private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
-    int[][] result = userDictionary.lookup(text);
+  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
+    int[][] result = userDictionary.lookup(text, offset, len);
     for(int[] segmentation : result) {
       int wordId = segmentation[0];
       int index = segmentation[1];
       int length = segmentation[2];
-      ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
+      ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
       addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
     }
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Fri Jan  6 18:37:10 2012
@@ -26,7 +26,9 @@ public final class ViterbiNode {
   
   private final int wordId;
   
-  private final String surfaceForm;
+  private final char[] surfaceForm;
+  private final int offset;
+  private final int length;
   
   private final int leftId;
   
@@ -44,9 +46,11 @@ public final class ViterbiNode {
   
   private final int startIndex;
   
-  public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+  public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
     this.wordId = wordId;
     this.surfaceForm = surfaceForm;
+    this.offset = offset;
+    this.length = length;
     this.leftId = leftId;
     this.rightId = rightId;
     this.wordCost = wordCost;
@@ -65,11 +69,32 @@ public final class ViterbiNode {
   /**
    * @return the surfaceForm
    */
-  public String getSurfaceForm() {
+  public char[] getSurfaceForm() {
     return surfaceForm;
   }
   
   /**
+   * @return start offset into surfaceForm
+   */
+  public int getOffset() {
+    return offset;
+  }
+  
+  /**
+   * @return length of surfaceForm
+   */
+  public int getLength() {
+    return length;
+  }
+  
+  /**
+   * @return the surfaceForm as a String
+   */
+  public String getSurfaceFormString() {
+    return new String(surfaceForm, offset, length);
+  }
+  
+  /**
    * @return the leftId
    */
   public int getLeftId() {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Fri Jan  6 18:37:10 2012
@@ -59,7 +59,7 @@ public class TokenizerTest extends Lucen
     List<Token> tokens = tokenizer.tokenize(input);
     assertTrue(tokens.size() == surfaceForms.length);
     for (int i = 0; i < tokens.size(); i++) {
-      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
+      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
     }
   }
   

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Fri Jan  6 18:37:10 2012
@@ -30,7 +30,8 @@ public class UserDictionaryTest extends 
   public void testLookup() throws IOException {
     
     UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
-    int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
+    String s = "関西国際空港に行った";
+    int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be three 関西, 国際, 空港
     assertEquals(3, dictionaryEntryResult.length);
     
@@ -44,7 +45,8 @@ public class UserDictionaryTest extends 
     assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
     assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
     
-    int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
+    s = "関西国際空港と関西国際空港に行った";
+    int[][] dictionaryEntryResult2 = dictionary.lookup(s.toCharArray(), 0, s.length());
     // Length should be six 
     assertEquals(6, dictionaryEntryResult2.length);
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java Fri Jan  6 18:37:10 2012
@@ -32,10 +32,10 @@ public class DoubleArrayTrieTest extends
   public void test() {		
     Trie trie = getTrie();
     DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie(trie);
-    assertEquals(0, doubleArrayTrie.lookup("a"));
-    assertTrue(doubleArrayTrie.lookup("abc") > 0);
-    assertTrue(doubleArrayTrie.lookup("あいう") > 0);
-    assertTrue(doubleArrayTrie.lookup("xyz") < 0);
+    assertEquals(0, doubleArrayTrie.lookup("a".toCharArray(), 0, 1));
+    assertTrue(doubleArrayTrie.lookup("abc".toCharArray(), 0, 3) > 0);
+    assertTrue(doubleArrayTrie.lookup("あいう".toCharArray(), 0, 3) > 0);
+    assertTrue(doubleArrayTrie.lookup("xyz".toCharArray(), 0, 3) < 0);
   }
   
   private Trie getTrie() {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1228334&r1=1228333&r2=1228334&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Fri Jan  6 18:37:10 2012
@@ -48,7 +48,7 @@ public class DictionaryBuilder {
     for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
       int tokenInfoId = entry.getKey();
       String surfaceform = entry.getValue();
-      int doubleArrayId = trie.lookup(surfaceform);
+      int doubleArrayId = trie.lookup(surfaceform.toCharArray(), 0, surfaceform.length());
       assert doubleArrayId > 0;
       tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
     }