You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC

svn commit: r1226637 [3/3] - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Tue Jan  3 04:33:56 2012
@@ -31,323 +31,323 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 public class Viterbi {
-
-	private final DoubleArrayTrie trie;
-	
-	private final TokenInfoDictionary dictionary;
-	
-	private final UnknownDictionary unkDictionary;
-	
-	private final ConnectionCosts costs;
-	
-	private final UserDictionary userDictionary;
-	
-	private final CharacterDefinition characterDefinition;
-	
-	private final boolean useUserDictionary;
-
-	private final boolean searchMode;
-	
-	private final boolean extendedMode;
-	
-	private static final int DEFAULT_COST = 10000000;
-
-	private static final int SEARCH_MODE_LENGTH_KANJI = 3;
-
-	private static final int SEARCH_MODE_LENGTH = 7;
-
-	private static final int SEARCH_MODE_PENALTY = 10000;
-		
-	private static final String BOS = "BOS";
-	
-	private static final String EOS = "EOS";
-
-	/**
-	 * Constructor
-	 * @param trie
-	 * @param targetMap
-	 * @param dictionary
-	 * @param unkDictionary
-	 * @param costs
-	 * @param userDictionary
-	 */
-	public Viterbi(DoubleArrayTrie trie,
-				   TokenInfoDictionary dictionary,
-				   UnknownDictionary unkDictionary,
-				   ConnectionCosts costs,
-				   UserDictionary userDictionary,
-				   Mode mode) {
-		this.trie = trie;
-		this.dictionary = dictionary;
-		this.unkDictionary = unkDictionary;
-		this.costs = costs;
-		this.userDictionary = userDictionary;
-		if(userDictionary == null) {
-			this.useUserDictionary = false;
-		} else {
-			this.useUserDictionary = true;
-		}
-
-		switch(mode){
-		case SEARCH:
-			searchMode = true;
-			extendedMode = false;
-			break;
-		case EXTENDED:
-			searchMode = true;
-			extendedMode = true;
-			break;
-		default:
-			searchMode = false;
-			extendedMode = false;
-			break;
-		}
-
-		this.characterDefinition = unkDictionary.getCharacterDefinition();
-	}
-
-	/**
-	 * Find best path from input lattice.
-	 * @param lattice the result of build method
-	 * @return	List of ViterbiNode which consist best path 
-	 */
-	public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
-		ViterbiNode[][] startIndexArr = lattice[0];
-		ViterbiNode[][] endIndexArr = lattice[1];
-		
-		for (int i = 1; i < startIndexArr.length; i++){
-
-			if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
-				continue;
-			}
-
-			for (ViterbiNode node : startIndexArr[i]) {
-				if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
-					break;
-				}
-
-				int backwardConnectionId = node.getLeftId();
-				int wordCost = node.getWordCost();
-				int leastPathCost = DEFAULT_COST;
-				for (ViterbiNode leftNode : endIndexArr[i]) {
-					if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
-						break;
-					}
-					
-					int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost;	// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
-
-					// "Search mode". Add extra costs if it is long node.
-					if (searchMode) {
-//						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
-						String surfaceForm = node.getSurfaceForm();
-						int length = surfaceForm.length();
-						if (length > SEARCH_MODE_LENGTH_KANJI) {
-							boolean allKanji = true;
-							// check if node consists of only kanji
-							for (int pos = 0; pos < length; pos++) {
-								if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
-									allKanji = false;
-									break;
-								}				
-							}
-							
-							if (allKanji) {	// Process only Kanji keywords
-								pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
-							} else if (length > SEARCH_MODE_LENGTH) {
-								pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;								
-							}
-						}
-					}
-					
-					if (pathCost < leastPathCost){	// If total cost is lower than before, set current previous node as best left node (previous means left).
-						leastPathCost = pathCost;
-						node.setPathCost(leastPathCost);
-						node.setLeftNode(leftNode);
-					}					
-				}
-			}
-		}
-
-		// track best path
-		ViterbiNode node = endIndexArr[0][0];	// EOS
-		LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
-		result.add(node);
-		while (true) {
-			ViterbiNode leftNode = node.getLeftNode();
-			if (leftNode == null) {
-				break;
-			}
-			
-			// EXTENDED mode convert unknown word into unigram node
-			if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
-				int unigramWordId = CharacterClass.NGRAM.getId();
-				int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
-				int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
-				int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
-				String surfaceForm = leftNode.getSurfaceForm();
-				for (int i = surfaceForm.length(); i > 0; i--) {
-					ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
-					result.addFirst(uniGramNode);
-				}
-			} else {
-				result.addFirst(leftNode);		
-			}
-			node = leftNode;
-		}
-		
-		return result;
-	}
-	
-
-	/**
-	 * Build lattice from input text
-	 * @param text
-	 * @return
-	 */
-	public ViterbiNode[][][] build(String text) {
-		int textLength = text.length();
-		ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][];  // text length + BOS and EOS
-		ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][];  // text length + BOS and EOS
-		int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
-		int[] endSizeArr = new int[textLength + 2];   // array to keep ViterbiNode count in endIndexArr
-		
-		ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
-		addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-
-		// Process user dictionary;
-		if (useUserDictionary) {
-			processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-		}
-		
-		int unknownWordEndIndex = -1;	// index of the last character of unknown word
-
-		for (int startIndex = 0; startIndex < textLength; startIndex++) {
-			// If no token ends where current token starts, skip this index
-			if (endSizeArr[startIndex + 1] == 0) {
-				continue;
-			}
-			
-			String suffix = text.substring(startIndex);
-
-			boolean found = false;
-			for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
-				String prefix = suffix.substring(0, endIndex);
-				
-				int result = trie.lookup(prefix);
-
-				if (result > 0) {	// Found match in double array trie
-					found = true;	// Don't produce unknown word starting from this index
-					for (int wordId : dictionary.lookupWordIds(result)) {
-						ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
-						addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-					}
-				} else if(result < 0) {	// If result is less than zero, continue to next position
-						break;						
-				}
-			}
-
-			// In the case of normal mode, it doesn't process unknown word greedily.
-			if(!searchMode && unknownWordEndIndex > startIndex){
-				continue;
-			}
-			
-			// Process Unknown Word
-			int unknownWordLength = 0;
-			char firstCharacter = suffix.charAt(0);
-			boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
-			if (isInvoke){	// Process "invoke"
-				unknownWordLength = unkDictionary.lookup(suffix);
-			} else if (found == false){	// Process not "invoke"
-				unknownWordLength = unkDictionary.lookup(suffix);				
-			}
-			
-			if (unknownWordLength > 0) {      // found unknown word
-				String unkWord = suffix.substring(0, unknownWordLength);
-				int characterId = characterDefinition.lookup(firstCharacter);
-				int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
-				
-				for (int wordId : wordIds) {
-					ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
-					addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-				}
-				unknownWordEndIndex = startIndex + unknownWordLength;
-			}
-		}
-		
-		ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
-		addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
-		
-		ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
-		
-		return result;
-	}
-
-	/**
-	 * Find token(s) in input text and set found token(s) in arrays as normal tokens
-	 * @param text	
-	 * @param startIndexArr
-	 * @param endIndexArr
-	 * @param startSizeArr
-	 * @param endSizeArr
-	 */
-	private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
-		int[][] result = userDictionary.lookup(text);
-		for(int[] segmentation : result) {
-			int wordId = segmentation[0];
-			int index = segmentation[1];
-			int length = segmentation[2];
-			ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
-			addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-		}
-	}
-	
-	/**
-	 * Add node to arrays and increment count in size array
-	 * @param node
-	 * @param startIndex
-	 * @param endIndex
-	 * @param startIndexArr
-	 * @param endIndexArr
-	 * @param startSizeArr
-	 * @param endSizeArr
-	 */
-	private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
-		int startNodesCount = startSizeArr[startIndex];
-		int endNodesCount = endSizeArr[endIndex];
-
-		if (startNodesCount == 0) {
-			startIndexArr[startIndex] = new ViterbiNode[10];
-		}
-
-		if (endNodesCount == 0) {
-			endIndexArr[endIndex] = new ViterbiNode[10];
-		}
-
-		if (startIndexArr[startIndex].length <= startNodesCount){
-			startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
-		}
-		
-		if (endIndexArr[endIndex].length <= endNodesCount){
-			endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
-		}
-				
-		startIndexArr[startIndex][startNodesCount] = node;
-		endIndexArr[endIndex][endNodesCount] = node;
-		
-		startSizeArr[startIndex] = startNodesCount + 1;
-		endSizeArr[endIndex] = endNodesCount + 1;
-	}
-	
-
-	/**
-	 * Return twice as big array which contains value of input array
-	 * @param array
-	 * @return
-	 */
-	private ViterbiNode[] extendArray(ViterbiNode[] array) {
-		//extend array
-		ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
-		System.arraycopy(array, 0, newArray, 0, array.length);
-		return newArray;
-	}
+  
+  private final DoubleArrayTrie trie;
+  
+  private final TokenInfoDictionary dictionary;
+  
+  private final UnknownDictionary unkDictionary;
+  
+  private final ConnectionCosts costs;
+  
+  private final UserDictionary userDictionary;
+  
+  private final CharacterDefinition characterDefinition;
+  
+  private final boolean useUserDictionary;
+  
+  private final boolean searchMode;
+  
+  private final boolean extendedMode;
+  
+  private static final int DEFAULT_COST = 10000000;
+  
+  private static final int SEARCH_MODE_LENGTH_KANJI = 3;
+  
+  private static final int SEARCH_MODE_LENGTH = 7;
+  
+  private static final int SEARCH_MODE_PENALTY = 10000;
+  
+  private static final String BOS = "BOS";
+  
+  private static final String EOS = "EOS";
+  
+  /**
+   * Creates a lattice builder and best-path searcher over the given dictionaries.
+   * @param trie double array trie used to look up known words in the input text
+   * @param dictionary dictionary giving word ids, connection ids and word costs of known words
+   * @param unkDictionary dictionary for unknown words; also supplies the character definition
+   * @param costs connection costs between a node and the node to its left
+   * @param userDictionary user dictionary, or {@code null} to build the lattice without one
+   * @param mode segmentation mode; SEARCH and EXTENDED turn on search mode, any other value is normal mode
+   */
+  public Viterbi(DoubleArrayTrie trie,
+      TokenInfoDictionary dictionary,
+      UnknownDictionary unkDictionary,
+      ConnectionCosts costs,
+      UserDictionary userDictionary,
+      Mode mode) {
+    this.trie = trie;
+    this.dictionary = dictionary;
+    this.unkDictionary = unkDictionary;
+    this.costs = costs;
+    this.userDictionary = userDictionary;
+    // The user dictionary is optional: build() only consults it when one was given.
+    this.useUserDictionary = userDictionary != null;
+    
+    // SEARCH adds a penalty to long nodes during the search;
+    // EXTENDED does the same and also splits unknown words into unigram nodes.
+    switch(mode){
+      case SEARCH:
+        searchMode = true;
+        extendedMode = false;
+        break;
+      case EXTENDED:
+        searchMode = true;
+        extendedMode = true;
+        break;
+      default:
+        searchMode = false;
+        extendedMode = false;
+        break;
+    }
+    
+    // The character definition is taken from the unknown-word dictionary.
+    this.characterDefinition = unkDictionary.getCharacterDefinition();
+  }
+  
+  /**
+   * Find best path from input lattice.
+   * @param lattice the result of build method
+   * @return	List of ViterbiNode which consist best path 
+   */
+  public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
+    ViterbiNode[][] startIndexArr = lattice[0];
+    ViterbiNode[][] endIndexArr = lattice[1];
+    
+    for (int i = 1; i < startIndexArr.length; i++){
+      
+      if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
+        continue;
+      }
+      
+      for (ViterbiNode node : startIndexArr[i]) {
+        if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
+          break;
+        }
+        
+        int backwardConnectionId = node.getLeftId();
+        int wordCost = node.getWordCost();
+        int leastPathCost = DEFAULT_COST;
+        for (ViterbiNode leftNode : endIndexArr[i]) {
+          if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+            break;
+          }
+          
+          int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost;	// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
+          
+          // "Search mode". Add extra costs if it is long node.
+          if (searchMode) {
+            //						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
+            String surfaceForm = node.getSurfaceForm();
+            int length = surfaceForm.length();
+            if (length > SEARCH_MODE_LENGTH_KANJI) {
+              boolean allKanji = true;
+              // check if node consists of only kanji
+              for (int pos = 0; pos < length; pos++) {
+                if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
+                  allKanji = false;
+                  break;
+                }				
+              }
+              
+              if (allKanji) {	// Process only Kanji keywords
+                pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
+              } else if (length > SEARCH_MODE_LENGTH) {
+                pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;								
+              }
+            }
+          }
+          
+          if (pathCost < leastPathCost){	// If total cost is lower than before, set current previous node as best left node (previous means left).
+            leastPathCost = pathCost;
+            node.setPathCost(leastPathCost);
+            node.setLeftNode(leftNode);
+          }					
+        }
+      }
+    }
+    
+    // track best path
+    ViterbiNode node = endIndexArr[0][0];	// EOS
+    LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
+    result.add(node);
+    while (true) {
+      ViterbiNode leftNode = node.getLeftNode();
+      if (leftNode == null) {
+        break;
+      }
+      
+      // EXTENDED mode convert unknown word into unigram node
+      if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
+        int unigramWordId = CharacterClass.NGRAM.getId();
+        int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
+        int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
+        int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
+        String surfaceForm = leftNode.getSurfaceForm();
+        for (int i = surfaceForm.length(); i > 0; i--) {
+          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
+          result.addFirst(uniGramNode);
+        }
+      } else {
+        result.addFirst(leftNode);		
+      }
+      node = leftNode;
+    }
+    
+    return result;
+  }
+  
+  
+  /**
+   * Builds the lattice of candidate nodes for the given text.
+   * In the start-index array, slot 0 holds BOS and slot {@code text.length() + 1} holds EOS;
+   * a word starting at character {@code i} is stored at start slot {@code i + 1}.
+   * @param text input text to build the lattice for
+   * @return two-element array: nodes grouped by start slot at [0], nodes grouped by end slot at [1]
+   */
+  public ViterbiNode[][][] build(String text) {
+    int textLength = text.length();
+    int slots = textLength + 2;  // text length + BOS and EOS
+    ViterbiNode[][] startIndexArr = new ViterbiNode[slots][];
+    ViterbiNode[][] endIndexArr = new ViterbiNode[slots][];
+    int[] startSizeArr = new int[slots]; // number of nodes stored in each slot of startIndexArr
+    int[] endSizeArr = new int[slots];   // number of nodes stored in each slot of endIndexArr
+    
+    // BOS starts at slot 0 and ends at slot 1, which is where the first character starts.
+    ViterbiNode bos = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
+    addToArrays(bos, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    
+    if (useUserDictionary) {
+      processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    }
+    
+    int unknownWordEndIndex = -1;  // end of the most recently added unknown word
+    
+    for (int startIndex = 0; startIndex < textLength; startIndex++) {
+      int slot = startIndex + 1;
+      if (endSizeArr[slot] == 0) {
+        continue;  // no node ends where a node starting here would begin
+      }
+      
+      String suffix = text.substring(startIndex);
+      int remaining = suffix.length();
+      
+      // Known words: try each prefix of the suffix until the trie returns a negative result.
+      boolean found = false;
+      for (int endIndex = 1; endIndex <= remaining; endIndex++) {
+        String prefix = suffix.substring(0, endIndex);
+        int result = trie.lookup(prefix);
+        if (result < 0) {
+          break;
+        }
+        if (result == 0) {
+          continue;
+        }
+        found = true;  // a match in the double array trie
+        for (int wordId : dictionary.lookupWordIds(result)) {
+          ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+          addToArrays(node, slot, slot + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+        }
+      }
+      
+      // Outside search mode, do not start another unknown word inside the previous one.
+      if (!searchMode && unknownWordEndIndex > startIndex) {
+        continue;
+      }
+      
+      // Unknown words: always looked up for "invoke" characters, otherwise only when no known word matched.
+      char firstCharacter = suffix.charAt(0);
+      int unknownWordLength = 0;
+      if (characterDefinition.isInvoke(firstCharacter) || !found) {
+        unknownWordLength = unkDictionary.lookup(suffix);
+      }
+      if (unknownWordLength <= 0) {
+        continue;
+      }
+      
+      String unkWord = suffix.substring(0, unknownWordLength);
+      int characterId = characterDefinition.lookup(firstCharacter);
+      for (int wordId : unkDictionary.lookupWordIds(characterId)) {
+        ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+        addToArrays(node, slot, slot + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+      }
+      unknownWordEndIndex = startIndex + unknownWordLength;
+    }
+    
+    // EOS starts after the last character and is stored in endIndexArr at index 0.
+    ViterbiNode eos = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
+    addToArrays(eos, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    
+    return new ViterbiNode[][][]{startIndexArr, endIndexArr};
+  }
+  
+  /**
+   * Adds a USER node to the lattice for every user dictionary match found in the text.
+   * @param text input text
+   * @param startIndexArr nodes grouped by start slot
+   * @param endIndexArr nodes grouped by end slot
+   * @param startSizeArr number of nodes stored in each slot of startIndexArr
+   * @param endSizeArr number of nodes stored in each slot of endIndexArr
+   */
+  private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
+    // each match is read as {word id, start offset in text, length}
+    for (int[] match : userDictionary.lookup(text)) {
+      int userWordId = match[0];
+      int begin = match[1];
+      int end = begin + match[2];
+      ViterbiNode userNode = new ViterbiNode(userWordId, text.substring(begin, end), userDictionary.getLeftId(userWordId), userDictionary.getRightId(userWordId), userDictionary.getWordCost(userWordId), begin, Type.USER);
+      addToArrays(userNode, begin + 1, end + 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    }
+  }
+  
+  /**
+   * Stores a node under its start slot and under its end slot, and updates both counts.
+   * A slot's array is created with room for 10 nodes on first use and doubled when full.
+   * @param node node to store
+   * @param startIndex slot of startIndexArr the node is stored in
+   * @param endIndex slot of endIndexArr the node is stored in
+   * @param startIndexArr nodes grouped by start slot
+   * @param endIndexArr nodes grouped by end slot
+   * @param startSizeArr number of nodes stored in each slot of startIndexArr
+   * @param endSizeArr number of nodes stored in each slot of endIndexArr
+   */
+  private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
+    int startCount = startSizeArr[startIndex];
+    int endCount = endSizeArr[endIndex];
+    
+    // Start side: create the slot array on first use,
+    // otherwise grow it when every position is taken.
+    if (startCount == 0) {
+      startIndexArr[startIndex] = new ViterbiNode[10];
+    } else if (startCount >= startIndexArr[startIndex].length) {
+      startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
+    }
+    startIndexArr[startIndex][startCount] = node;
+    startSizeArr[startIndex] = startCount + 1;
+    
+    // End side: same handling,
+    // using the end slot and its own count.
+    if (endCount == 0) {
+      endIndexArr[endIndex] = new ViterbiNode[10];
+    } else if (endCount >= endIndexArr[endIndex].length) {
+      endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
+    }
+    endIndexArr[endIndex][endCount] = node;
+    endSizeArr[endIndex] = endCount + 1;
+  }
+  
+  
+  
+  /**
+   * Creates a copy of the given array with twice its length.
+   * @param array array to grow
+   * @return new array of double length holding the original elements at the same positions
+   */
+  private ViterbiNode[] extendArray(ViterbiNode[] array) {
+    int oldLength = array.length;
+    ViterbiNode[] grown = new ViterbiNode[oldLength * 2];
+    System.arraycopy(array, 0, grown, 0, oldLength);
+    return grown;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Tue Jan  3 04:33:56 2012
@@ -18,105 +18,105 @@ package org.apache.lucene.analysis.kurom
  */
 
 public class ViterbiNode {
-	public enum Type {
-		KNOWN,
-		UNKNOWN,
-		USER
-	}
-
-	private final int wordId;
-	
-	private final String surfaceForm;
-	
-	private final int leftId;
-	
-	private final int rightId;
-	
-	/** word cost for this node */
-	private final int wordCost;
-	
-	/** minimum path cost found thus far */
-	private int pathCost;
-		
-	private ViterbiNode leftNode;
-
-	private final Type type;
-	
-	private final int startIndex;
-	
-	public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
-		this.wordId = wordId;
-		this.surfaceForm = surfaceForm;
-		this.leftId = leftId;
-		this.rightId = rightId;
-		this.wordCost = wordCost;
-		this.startIndex = startIndex;
-		this.type = type;
-	}
-	
-
-	/**
-	 * @return the wordId
-	 */
-	public int getWordId() {
-		return wordId;
-	}
-
-	/**
-	 * @return the surfaceForm
-	 */
-	public String getSurfaceForm() {
-		return surfaceForm;
-	}
-
-	/**
-	 * @return the leftId
-	 */
-	public int getLeftId() {
-		return leftId;
-	}
-
-	/**
-	 * @return the rightId
-	 */
-	public int getRightId() {
-		return rightId;
-	}
-
-	/**
-	 * @return the cost
-	 */
-	public int getWordCost() {
-		return wordCost;
-	}
-
-	/**
-	 * @return the cost
-	 */
-	public int getPathCost() {
-		return pathCost;
-	}
-
-	/**
-	 * param cost minimum path cost found this far
-	 */
-	public void setPathCost(int pathCost) {
-		this.pathCost = pathCost;
-	}
-	
-	public void setLeftNode(ViterbiNode node) {
-		leftNode = node;
-	}
-
-	public ViterbiNode getLeftNode() {
-		return leftNode;
-	}
-
-	public int getStartIndex() {
-		return startIndex;
-	}
-
-	public Type getType() {
-		return type;
-	}
+  public enum Type {
+    KNOWN,    // used by Viterbi for system dictionary matches and for the BOS/EOS nodes
+    UNKNOWN,  // used by Viterbi for words taken from the unknown-word dictionary
+    USER      // used by Viterbi for user dictionary matches
+  }
+  /** word id supplied by the creator of the node */
+  private final int wordId;
+  /** text this node covers */
+  private final String surfaceForm;
+  /** connection id paired with the previous node's right id when looking up connection costs */
+  private final int leftId;
+  /** connection id paired with the following node's left id when looking up connection costs */
+  private final int rightId;
+  
+  /** word cost for this node */
+  private final int wordCost;
+  
+  /** minimum path cost found thus far */
+  private int pathCost;
+  /** previous node on the cheapest path found thus far; null until set */
+  private ViterbiNode leftNode;
+  /** kind of entry this node was created from */
+  private final Type type;
+  /** start index supplied by the creator of the node */
+  private final int startIndex;
+  /** Creates a node; pathCost and leftNode keep their default values until set. */
+  public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+    this.wordId = wordId;
+    this.surfaceForm = surfaceForm;
+    this.leftId = leftId;
+    this.rightId = rightId;
+    this.wordCost = wordCost;
+    this.startIndex = startIndex;
+    this.type = type;
+  }
+  
+  
+  /**
+   * @return the word id given at construction
+   */
+  public int getWordId() {
+    return wordId;
+  }
+  
+  /**
+   * @return the surface form given at construction
+   */
+  public String getSurfaceForm() {
+    return surfaceForm;
+  }
+  
+  /**
+   * @return the left connection id
+   */
+  public int getLeftId() {
+    return leftId;
+  }
+  
+  /**
+   * @return the right connection id
+   */
+  public int getRightId() {
+    return rightId;
+  }
+  
+  /**
+   * @return the word cost of this node
+   */
+  public int getWordCost() {
+    return wordCost;
+  }
+  
+  /**
+   * @return the minimum path cost found thus far
+   */
+  public int getPathCost() {
+    return pathCost;
+  }
+  
+  /**
+   * @param pathCost minimum path cost found thus far
+   */
+  public void setPathCost(int pathCost) {
+    this.pathCost = pathCost;
+  }
+  /** @param node previous node on the cheapest path found thus far */
+  public void setLeftNode(ViterbiNode node) {
+    leftNode = node;
+  }
+  /** @return the previous node on the cheapest path, or null if none has been set */
+  public ViterbiNode getLeftNode() {
+    return leftNode;
+  }
+  /** @return the start index given at construction */
+  public int getStartIndex() {
+    return startIndex;
+  }
+  /** @return the type given at construction */
+  public Type getType() {
+    return type;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Tue Jan  3 04:33:56 2012
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.util.List;
@@ -30,69 +29,69 @@ import org.junit.BeforeClass;
 import org.junit.Test;
 
 public class TokenizerTest extends LuceneTestCase {
-
-	private static Tokenizer tokenizer;
-	
-	@BeforeClass
-	public static void setUpBeforeClass() throws Exception {
-		tokenizer = Tokenizer.builder().build();
-	}
-	
-	@AfterClass
-	public static void afterClass() throws Exception {
-	  tokenizer = null;
-	}
-
-	@Test
-	public void testSegmentation() {
-		// Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
-//		String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
-//		String[] surfaceForms = {
-//				"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
-//				"スペース", "ステーション", "に", "行き", "ます", "。",
-//				"うたがわしい", "。"
-//		};
-		String input = "スペースステーションに行きます。うたがわしい。";
-		String[] surfaceForms = {
-				"スペース", "ステーション", "に", "行き", "ます", "。",
-				"うたがわしい", "。"
-		};
-		List<Token> tokens = tokenizer.tokenize(input);
-		assertTrue(tokens.size() == surfaceForms.length);
-		for (int i = 0; i < tokens.size(); i++) {
-			assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
-		}
-	}
-	
-	
-	@Test
-	public void testReadings() {
-		List<Token> tokens = tokenizer.tokenize("寿司が食べたいです。");
-		assertTrue(tokens.size() == 6);
-		assertEquals(tokens.get(0).getReading(), "スシ");
-		assertEquals(tokens.get(1).getReading(), "ガ");
-		assertEquals(tokens.get(2).getReading(), "タベ");
-		assertEquals(tokens.get(3).getReading(), "タイ");
-		assertEquals(tokens.get(4).getReading(), "デス");
-		assertEquals(tokens.get(5).getReading(), "。");
-	}
-	
-	public void testBocchan() throws Exception {
-	  doTestBocchan(1);
-	}
-	
-	@Test @Nightly
-	public void testBocchanBig() throws Exception {
-		doTestBocchan(100);
-	}
-	
-	private void doTestBocchan(int numIterations) throws Exception {
-	  LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+  
+  private static Tokenizer tokenizer;
+  
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    tokenizer = Tokenizer.builder().build();
+  }
+  
+  @AfterClass
+  public static void afterClass() throws Exception {
+    tokenizer = null;
+  }
+  
+  @Test
+  public void testSegmentation() {
+    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+    //		String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+    //		String[] surfaceForms = {
+    //				"ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+    //				"スペース", "ステーション", "に", "行き", "ます", "。",
+    //				"うたがわしい", "。"
+    //		};
+    String input = "スペースステーションに行きます。うたがわしい。";
+    String[] surfaceForms = {
+        "スペース", "ステーション", "に", "行き", "ます", "。",
+        "うたがわしい", "。"
+    };
+    List<Token> tokens = tokenizer.tokenize(input);
+    assertTrue(tokens.size() == surfaceForms.length);
+    for (int i = 0; i < tokens.size(); i++) {
+      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
+    }
+  }
+  
+  
+  @Test
+  public void testReadings() {
+    List<Token> tokens = tokenizer.tokenize("寿司が食べたいです。");
+    assertTrue(tokens.size() == 6);
+    assertEquals(tokens.get(0).getReading(), "スシ");
+    assertEquals(tokens.get(1).getReading(), "ガ");
+    assertEquals(tokens.get(2).getReading(), "タベ");
+    assertEquals(tokens.get(3).getReading(), "タイ");
+    assertEquals(tokens.get(4).getReading(), "デス");
+    assertEquals(tokens.get(5).getReading(), "。");
+  }
+  
+  public void testBocchan() throws Exception {
+    doTestBocchan(1);
+  }
+  
+  @Test @Nightly
+  public void testBocchanBig() throws Exception {
+    doTestBocchan(100);
+  }
+  
+  private void doTestBocchan(int numIterations) throws Exception {
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
         this.getClass().getResourceAsStream("bocchan.utf-8")));
     
     String line = reader.readLine();
     reader.close();
-
+    
     if (VERBOSE) {
       System.out.println("Test for Bocchan without pre-splitting sentences");
     }
@@ -114,5 +113,5 @@ public class TokenizerTest extends Lucen
     if (VERBOSE) {
       System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
     }
-	}
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java Tue Jan  3 04:33:56 2012
@@ -28,109 +28,109 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.Test;
 
 public class UnknownDictionaryTest extends LuceneTestCase {
-	public static final String FILENAME = "unk-tokeninfo-dict.obj";
-
-	@Test
-	public void testPutCharacterCategory() {
-		UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
-		
-		try{
-			unkDic.putCharacterCategory(0, "DUMMY_NAME");
-			fail();
-		} catch(Exception e) {
-			
-		}
-
-		try{
-			unkDic.putCharacterCategory(-1, "KATAKANA");
-			fail();
-		} catch(Exception e) {
-			
-		}
-		
-		unkDic.putCharacterCategory(0, "DEFAULT");
-		unkDic.putCharacterCategory(1, "GREEK");
-		unkDic.putCharacterCategory(2, "HIRAGANA");
-		unkDic.putCharacterCategory(3, "KATAKANA");
-		unkDic.putCharacterCategory(4, "KANJI");
-	}
-	
-	@Test
-	public void testPut() {
-		UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
-		try{
-			unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*"));
-			fail();
-		} catch(Exception e){
-			
-		}
-		
-		String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*";
-		String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*";
-		String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*";
-		
-		unkDic.putCharacterCategory(0, "KANJI");
-		unkDic.putCharacterCategory(1, "ALPHA");
-		unkDic.putCharacterCategory(2, "HIRAGANA");
-		
-		unkDic.put(CSVUtil.parse(entry1));
-		unkDic.put(CSVUtil.parse(entry2));
-		unkDic.put(CSVUtil.parse(entry3));
-	}
-	
-	private UnknownDictionary createDictionary() throws IOException {
-		InputStream is = this.getClass().getClassLoader().getResourceAsStream("unk.def.utf-8");
-		UnknownDictionary dictionary = new UnknownDictionary();
-		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
-		
-		String line = null;
-		while((line = reader.readLine()) != null) {
-			dictionary.put(CSVUtil.parse(line));
-		}
-		reader.close();
-
-		is = this.getClass().getClassLoader().getResourceAsStream("char.def.utf-8");
-		reader = new BufferedReader(new InputStreamReader(is));
-		
-		line = null;
-		while ((line = reader.readLine()) != null) {
-			line = line.replaceAll("^\\s", "");
-			line = line.replaceAll("\\s*#.*", "");
-			line = line.replaceAll("\\s+", " ");
-			
-			// Skip empty line or comment line
-			if(line.length() == 0) {
-				continue;
-			}
-			
-			if(line.startsWith("0x")) {	// Category mapping
-				String[] values = line.split(" ", 2);	// Split only first space
-				
-				if(!values[0].contains("..")) {
-					int cp = Integer.decode(values[0]).intValue();
-					dictionary.putCharacterCategory(cp, values[1]);					
-				} else {
-					String[] codePoints = values[0].split("\\.\\.");
-					int cpFrom = Integer.decode(codePoints[0]).intValue();
-					int cpTo = Integer.decode(codePoints[1]).intValue();
-					
-					for(int i = cpFrom; i <= cpTo; i++){
-						dictionary.putCharacterCategory(i, values[1]);					
-					}
-				}
-			} else {	// Invoke definition
-				String[] values = line.split(" "); // Consecutive space is merged above
-				String characterClassName = values[0];
-				int invoke = Integer.parseInt(values[1]);
-				int group = Integer.parseInt(values[2]);
-				int length = Integer.parseInt(values[3]);
-				dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
-			}
-			
-		}
-		
-		reader.close();
-		
-		return dictionary;
-	}
+  public static final String FILENAME = "unk-tokeninfo-dict.obj";
+  
+  @Test
+  public void testPutCharacterCategory() {
+    UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
+    
+    try{
+      unkDic.putCharacterCategory(0, "DUMMY_NAME");
+      fail();
+    } catch(Exception e) {
+      
+    }
+    
+    try{
+      unkDic.putCharacterCategory(-1, "KATAKANA");
+      fail();
+    } catch(Exception e) {
+      
+    }
+    
+    unkDic.putCharacterCategory(0, "DEFAULT");
+    unkDic.putCharacterCategory(1, "GREEK");
+    unkDic.putCharacterCategory(2, "HIRAGANA");
+    unkDic.putCharacterCategory(3, "KATAKANA");
+    unkDic.putCharacterCategory(4, "KANJI");
+  }
+  
+  @Test
+  public void testPut() {
+    UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
+    try{
+      unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*"));
+      fail();
+    } catch(Exception e){
+      
+    }
+    
+    String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*";
+    String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*";
+    String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*";
+    
+    unkDic.putCharacterCategory(0, "KANJI");
+    unkDic.putCharacterCategory(1, "ALPHA");
+    unkDic.putCharacterCategory(2, "HIRAGANA");
+    
+    unkDic.put(CSVUtil.parse(entry1));
+    unkDic.put(CSVUtil.parse(entry2));
+    unkDic.put(CSVUtil.parse(entry3));
+  }
+  
+  private UnknownDictionary createDictionary() throws IOException {
+    InputStream is = this.getClass().getClassLoader().getResourceAsStream("unk.def.utf-8");
+    UnknownDictionary dictionary = new UnknownDictionary();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+    
+    String line = null;
+    while((line = reader.readLine()) != null) {
+      dictionary.put(CSVUtil.parse(line));
+    }
+    reader.close();
+    
+    is = this.getClass().getClassLoader().getResourceAsStream("char.def.utf-8");
+    reader = new BufferedReader(new InputStreamReader(is));
+    
+    line = null;
+    while ((line = reader.readLine()) != null) {
+      line = line.replaceAll("^\\s", "");
+      line = line.replaceAll("\\s*#.*", "");
+      line = line.replaceAll("\\s+", " ");
+      
+      // Skip empty line or comment line
+      if(line.length() == 0) {
+        continue;
+      }
+      
+      if(line.startsWith("0x")) {	// Category mapping
+        String[] values = line.split(" ", 2);	// Split only first space
+        
+        if(!values[0].contains("..")) {
+          int cp = Integer.decode(values[0]).intValue();
+          dictionary.putCharacterCategory(cp, values[1]);					
+        } else {
+          String[] codePoints = values[0].split("\\.\\.");
+          int cpFrom = Integer.decode(codePoints[0]).intValue();
+          int cpTo = Integer.decode(codePoints[1]).intValue();
+          
+          for(int i = cpFrom; i <= cpTo; i++){
+            dictionary.putCharacterCategory(i, values[1]);					
+          }
+        }
+      } else {	// Invoke definition
+        String[] values = line.split(" "); // Consecutive space is merged above
+        String characterClassName = values[0];
+        int invoke = Integer.parseInt(values[1]);
+        int group = Integer.parseInt(values[2]);
+        int length = Integer.parseInt(values[3]);
+        dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+      }
+      
+    }
+    
+    reader.close();
+    
+    return dictionary;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Tue Jan  3 04:33:56 2012
@@ -25,53 +25,53 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.Test;
 
 public class UserDictionaryTest extends LuceneTestCase {
-
-	@Test
-	public void testLookup() throws IOException {
-	 
-		UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
-		int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
-		// Length should be three 関西, 国際, 空港
-		assertEquals(3, dictionaryEntryResult.length);
-
-		// Test positions
-		assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
-		assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
-		assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
-
-		// Test lengths
-		assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
-		assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
-		assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
-
-		int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
-		// Length should be six 
-		assertEquals(6, dictionaryEntryResult2.length);
-	}
-
-	@Test
-	public void testReadings() throws IOException {
+  
+  @Test
+  public void testLookup() throws IOException {
+    
     UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
-		int wordIdNihon = 100000000; // wordId of 日本 in 日本経済新聞
-		assertEquals("ニホン", dictionary.getReading(wordIdNihon));
-
-		int wordIdAsashoryu = 100000006; // wordId for 朝青龍
-		assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
-		
-		int wordIdNotExist = 1;
-		assertNull(dictionary.getReading(wordIdNotExist));
-	}
-	
-	@Test
-	public void testPartOfSpeech() throws IOException {
+    int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
+    // Length should be three 関西, 国際, 空港
+    assertEquals(3, dictionaryEntryResult.length);
+    
+    // Test positions
+    assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
+    assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
+    assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
+    
+    // Test lengths
+    assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
+    assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
+    assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
+    
+    int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
+    // Length should be six 
+    assertEquals(6, dictionaryEntryResult2.length);
+  }
+  
+  @Test
+  public void testReadings() throws IOException {
+    UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
+    int wordIdNihon = 100000000; // wordId of 日本 in 日本経済新聞
+    assertEquals("ニホン", dictionary.getReading(wordIdNihon));
+    
+    int wordIdAsashoryu = 100000006; // wordId for 朝青龍
+    assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
+    
+    int wordIdNotExist = 1;
+    assertNull(dictionary.getReading(wordIdNotExist));
+  }
+  
+  @Test
+  public void testPartOfSpeech() throws IOException {
     UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
-		int wordIdKeizai = 100000001; // wordId of 経済 in 日本経済新聞
-		assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai));
-	}
-	
-	@Test
-	public void testRead() throws IOException {
+    int wordIdKeizai = 100000001; // wordId of 経済 in 日本経済新聞
+    assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai));
+  }
+  
+  @Test
+  public void testRead() throws IOException {
     UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
-		assertNotNull(dictionary);		
-	}
+    assertNotNull(dictionary);		
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java Tue Jan  3 04:33:56 2012
@@ -27,76 +27,76 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.Test;
 
 public class DoubleArrayTrieTest extends LuceneTestCase {
-
-	@Test
-	public void testBuild() {		
-		Trie trie = getTrie();
-		DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
-		doubleArrayTrie.build(trie);
-	}
-
-	@Test
-	public void testWrite() throws IOException {
-		Trie trie = getTrie();
-		
-		DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
-		doubleArrayTrie.build(trie);
-		
-		try{
-			doubleArrayTrie.write("/some/path/which/is/not/exist");
-			fail();
-		}catch(IOException e){
-			
-		}
-		
-		// nocommit: lets use TEMPDIR here
-		String tmpDir = System.getProperty("java.io.tmpdir");
-		File dir = new File(tmpDir + File.separator + "datmp");
-		dir.mkdir();
-		doubleArrayTrie.write(dir.getCanonicalPath());
-		dir.deleteOnExit();
-		for(File file : dir.listFiles()) {
-			file.deleteOnExit();
-		}
-		
-		assertTrue(dir.length() > 0);
-		
-	}
-
-	@Test
-	public void testLookup() throws IOException {
-		Trie trie = getTrie();
-		
-		DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
-		doubleArrayTrie.build(trie);
-
+  
+  @Test
+  public void testBuild() {		
+    Trie trie = getTrie();
+    DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+    doubleArrayTrie.build(trie);
+  }
+  
+  @Test
+  public void testWrite() throws IOException {
+    Trie trie = getTrie();
+    
+    DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+    doubleArrayTrie.build(trie);
+    
+    try{
+      doubleArrayTrie.write("/some/path/which/is/not/exist");
+      fail();
+    }catch(IOException e){
+      
+    }
+    
     // nocommit: lets use TEMPDIR here
-		String tmpDir = System.getProperty("java.io.tmpdir");
-		File dir = new File(tmpDir + File.separator + "datmp");
-		dir.mkdir();
-		doubleArrayTrie.write(dir.getCanonicalPath());
-		dir.deleteOnExit();
-		for(File file : dir.listFiles()) {
-			file.deleteOnExit();
-		}
-
-		doubleArrayTrie = DoubleArrayTrie.read(new FileInputStream(dir.getCanonicalPath() + File.separator + DoubleArrayTrie.FILENAME));
-		
-		assertEquals(0, doubleArrayTrie.lookup("a"));
-		assertTrue(doubleArrayTrie.lookup("abc") > 0);
-		assertTrue(doubleArrayTrie.lookup("あいう") > 0);
-		assertTrue(doubleArrayTrie.lookup("xyz") < 0);
-
-	}
-	
-	private Trie getTrie() {
-		Trie trie = new Trie();
-		trie.add("abc");
-		trie.add("abd");
-		trie.add("あああ");
-		trie.add("あいう");
-		return trie;
-	}
-	
-
+    String tmpDir = System.getProperty("java.io.tmpdir");
+    File dir = new File(tmpDir + File.separator + "datmp");
+    dir.mkdir();
+    doubleArrayTrie.write(dir.getCanonicalPath());
+    dir.deleteOnExit();
+    for(File file : dir.listFiles()) {
+      file.deleteOnExit();
+    }
+    
+    assertTrue(dir.length() > 0);
+    
+  }
+  
+  @Test
+  public void testLookup() throws IOException {
+    Trie trie = getTrie();
+    
+    DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+    doubleArrayTrie.build(trie);
+    
+    // nocommit: lets use TEMPDIR here
+    String tmpDir = System.getProperty("java.io.tmpdir");
+    File dir = new File(tmpDir + File.separator + "datmp");
+    dir.mkdir();
+    doubleArrayTrie.write(dir.getCanonicalPath());
+    dir.deleteOnExit();
+    for(File file : dir.listFiles()) {
+      file.deleteOnExit();
+    }
+    
+    doubleArrayTrie = DoubleArrayTrie.read(new FileInputStream(dir.getCanonicalPath() + File.separator + DoubleArrayTrie.FILENAME));
+    
+    assertEquals(0, doubleArrayTrie.lookup("a"));
+    assertTrue(doubleArrayTrie.lookup("abc") > 0);
+    assertTrue(doubleArrayTrie.lookup("あいう") > 0);
+    assertTrue(doubleArrayTrie.lookup("xyz") < 0);
+    
+  }
+  
+  private Trie getTrie() {
+    Trie trie = new Trie();
+    trie.add("abc");
+    trie.add("abd");
+    trie.add("あああ");
+    trie.add("あいう");
+    return trie;
+  }
+  
+  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java Tue Jan  3 04:33:56 2012
@@ -23,137 +23,137 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.Test;
 
 public class NodeTest extends LuceneTestCase {
-
-	@Test
-	public void testNode() {
-		Trie trie = new Trie();
-		
-		Node node = trie.new Node('!');
-		assertEquals('!', node.getKey());
-
-		node = trie.new Node('1');
-		assertEquals('1', node.getKey());
-
-		node = trie.new Node('a');
-		assertEquals('a', node.getKey());
-		
-		node = trie.new Node('!');
-		assertEquals('!', node.getKey());
-
-		node = trie.new Node('1');
-		assertEquals('1', node.getKey());
-
-		node = trie.new Node('あ');
-		assertEquals('あ', node.getKey());
-
-		node = trie.new Node('漢');
-		assertEquals('漢', node.getKey());		
-		
-	}
-
-	@Test
-	public void testAddChild() {
-		Trie trie = new Trie();
-		Node node = trie.new Node('a');
-
-		Node returnedNode = node.addChild(trie.new Node('b'));
-		assertEquals('b', returnedNode.getKey());
-		assertEquals(1, node.getChildren().length);		
-		assertEquals('b', node.getChildren()[0].getKey());
-		
-		returnedNode = node.addChild(trie.new Node('c'));
-		assertEquals('c', returnedNode.getKey());		
-		assertEquals(2, node.getChildren().length);		
-		assertEquals('c', node.getChildren()[1].getKey());
-	}
-
-	@Test
-	public void testAdd() {
-		Trie trie = new Trie();
-
-		Node node = trie.new Node('a');
-		node.add("");
-		assertEquals(0, node.getChildren().length);
-		
-		node = trie.new Node('a');
-		node.add("b");
-		assertEquals(1, node.getChildren().length);
-		assertEquals('b', node.getChildren()[0].getKey());		
-
-		node = trie.new Node('a');
-		node.add("bc");
-		Node b = node.getChildren()[0];
-		assertEquals(1, node.getChildren().length);
-		assertEquals('b', b.getKey());		
-		assertEquals(1, b.getChildren().length);
-		Node c = b.getChildren()[0];
-		assertEquals('c', c.getKey());		
-		assertEquals(0, c.getChildren().length);
-
-		node.add("bd");
-		b = node.getChildren()[0];
-		assertEquals(1, node.getChildren().length);
-		assertEquals('b', b.getKey());
-		assertEquals(2, b.getChildren().length);
-		c = b.getChildren()[0];
-		assertEquals('c', c.getKey());		
-		assertEquals(0, c.getChildren().length);
-		Node d = b.getChildren()[1];
-		assertEquals('d', d.getKey());		
-		assertEquals(0, d.getChildren().length);
-	}
-	
-	
-	@Test
-	public void testGetkey() {
-		Trie trie = new Trie();
-
-		Node node = trie.new Node('!');
-		assertEquals('!', node.getKey());
-
-		node = trie.new Node('1');
-		assertEquals('1', node.getKey());
-
-		node = trie.new Node('a');
-		assertEquals('a', node.getKey());
-		
-		node = trie.new Node('!');
-		assertEquals('!', node.getKey());
-
-		node = trie.new Node('1');
-		assertEquals('1', node.getKey());
-
-		node = trie.new Node('あ');
-		assertEquals('あ', node.getKey());
-
-		node = trie.new Node('漢');
-		assertEquals('漢', node.getKey());		
-	}
-	
-	@Test
-	public void testHasSinglePath() {
-		Trie trie = new Trie();
-
-		Node node = trie.new Node('a');
-		node.add("bcd");
-		assertEquals(true, node.hasSinglePath());
-		
-		node.add("bce");
-		assertEquals(false, node.hasSinglePath());
-	}
-	
-	@Test
-	public void testGetChildren() {
-		Trie trie = new Trie();
-
-		Node node = trie.new Node('a');
-		node.add("bcd");
-		node.add("bde");
-		node.add("xyz");
-		
-		assertEquals(2, node.getChildren().length);
-		assertEquals('b', node.getChildren()[0].getKey());
-		assertEquals('x', node.getChildren()[1].getKey());
-		
-	}
+  
+  @Test
+  public void testNode() {
+    Trie trie = new Trie();
+    
+    Node node = trie.new Node('!');
+    assertEquals('!', node.getKey());
+    
+    node = trie.new Node('1');
+    assertEquals('1', node.getKey());
+    
+    node = trie.new Node('a');
+    assertEquals('a', node.getKey());
+    
+    node = trie.new Node('!');
+    assertEquals('!', node.getKey());
+    
+    node = trie.new Node('1');
+    assertEquals('1', node.getKey());
+    
+    node = trie.new Node('あ');
+    assertEquals('あ', node.getKey());
+    
+    node = trie.new Node('漢');
+    assertEquals('漢', node.getKey());		
+    
+  }
+  
+  @Test
+  public void testAddChild() {
+    Trie trie = new Trie();
+    Node node = trie.new Node('a');
+    
+    Node returnedNode = node.addChild(trie.new Node('b'));
+    assertEquals('b', returnedNode.getKey());
+    assertEquals(1, node.getChildren().length);		
+    assertEquals('b', node.getChildren()[0].getKey());
+    
+    returnedNode = node.addChild(trie.new Node('c'));
+    assertEquals('c', returnedNode.getKey());		
+    assertEquals(2, node.getChildren().length);		
+    assertEquals('c', node.getChildren()[1].getKey());
+  }
+  
+  @Test
+  public void testAdd() {
+    Trie trie = new Trie();
+    
+    Node node = trie.new Node('a');
+    node.add("");
+    assertEquals(0, node.getChildren().length);
+    
+    node = trie.new Node('a');
+    node.add("b");
+    assertEquals(1, node.getChildren().length);
+    assertEquals('b', node.getChildren()[0].getKey());		
+    
+    node = trie.new Node('a');
+    node.add("bc");
+    Node b = node.getChildren()[0];
+    assertEquals(1, node.getChildren().length);
+    assertEquals('b', b.getKey());		
+    assertEquals(1, b.getChildren().length);
+    Node c = b.getChildren()[0];
+    assertEquals('c', c.getKey());		
+    assertEquals(0, c.getChildren().length);
+    
+    node.add("bd");
+    b = node.getChildren()[0];
+    assertEquals(1, node.getChildren().length);
+    assertEquals('b', b.getKey());
+    assertEquals(2, b.getChildren().length);
+    c = b.getChildren()[0];
+    assertEquals('c', c.getKey());		
+    assertEquals(0, c.getChildren().length);
+    Node d = b.getChildren()[1];
+    assertEquals('d', d.getKey());		
+    assertEquals(0, d.getChildren().length);
+  }
+  
+  
+  @Test
+  public void testGetkey() {
+    Trie trie = new Trie();
+    
+    Node node = trie.new Node('!');
+    assertEquals('!', node.getKey());
+    
+    node = trie.new Node('1');
+    assertEquals('1', node.getKey());
+    
+    node = trie.new Node('a');
+    assertEquals('a', node.getKey());
+    
+    node = trie.new Node('!');
+    assertEquals('!', node.getKey());
+    
+    node = trie.new Node('1');
+    assertEquals('1', node.getKey());
+    
+    node = trie.new Node('あ');
+    assertEquals('あ', node.getKey());
+    
+    node = trie.new Node('漢');
+    assertEquals('漢', node.getKey());		
+  }
+  
+  @Test
+  public void testHasSinglePath() {
+    Trie trie = new Trie();
+    
+    Node node = trie.new Node('a');
+    node.add("bcd");
+    assertEquals(true, node.hasSinglePath());
+    
+    node.add("bce");
+    assertEquals(false, node.hasSinglePath());
+  }
+  
+  @Test
+  public void testGetChildren() {
+    Trie trie = new Trie();
+    
+    Node node = trie.new Node('a');
+    node.add("bcd");
+    node.add("bde");
+    node.add("xyz");
+    
+    assertEquals(2, node.getChildren().length);
+    assertEquals('b', node.getChildren()[0].getKey());
+    assertEquals('x', node.getChildren()[1].getKey());
+    
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java Tue Jan  3 04:33:56 2012
@@ -23,50 +23,50 @@ import org.apache.lucene.util.LuceneTest
 import org.junit.Test;
 
 public class TrieTest extends LuceneTestCase {
-	
-	@Test
-	public void testGetRoot() {
-		Trie trie = new Trie();
-		Node rootNode = trie.getRoot();
-		assertNotNull(rootNode);
-	}
-	
-	@Test
-	public void testAdd() {
-		Trie trie = new Trie();
-		trie.add("aa");
-		trie.add("ab");
-		trie.add("bb");
-		
-		Node rootNode = trie.getRoot();
-		assertEquals(2, rootNode.getChildren().length);
-		assertEquals(2, rootNode.getChildren()[0].getChildren().length);
-		assertEquals(1, rootNode.getChildren()[1].getChildren().length);
-	}
-	
-	@Test
-	public void testGetChildren() {
-		Trie trie = new Trie();
-		trie.add("aa");
-		trie.add("ab");
-		trie.add("bb");
-		
-		Node rootNode = trie.getRoot();
-		assertEquals(2, rootNode.getChildren().length);
-		assertEquals(2, rootNode.getChildren()[0].getChildren().length);
-		assertEquals(1, rootNode.getChildren()[1].getChildren().length);
-	}
-	
-	@Test
-	public void testSinglePath() {
-		Trie trie = new Trie();
-		assertTrue(trie.getRoot().hasSinglePath());
-		trie.add("abcdef");
-		assertTrue(trie.getRoot().hasSinglePath());
-		trie.add("abdfg");
-		Node rootNode = trie.getRoot();
-		assertEquals(2, rootNode.getChildren()[0].getChildren()[0].getChildren().length);
-		assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[0].hasSinglePath());
-		assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[1].hasSinglePath());
-	}
+  
+  @Test
+  public void testGetRoot() {
+    Trie trie = new Trie();
+    Node rootNode = trie.getRoot();
+    assertNotNull(rootNode);
+  }
+  
+  @Test
+  public void testAdd() {
+    Trie trie = new Trie();
+    trie.add("aa");
+    trie.add("ab");
+    trie.add("bb");
+    
+    Node rootNode = trie.getRoot();
+    assertEquals(2, rootNode.getChildren().length);
+    assertEquals(2, rootNode.getChildren()[0].getChildren().length);
+    assertEquals(1, rootNode.getChildren()[1].getChildren().length);
+  }
+  
+  @Test
+  public void testGetChildren() {
+    Trie trie = new Trie();
+    trie.add("aa");
+    trie.add("ab");
+    trie.add("bb");
+    
+    Node rootNode = trie.getRoot();
+    assertEquals(2, rootNode.getChildren().length);
+    assertEquals(2, rootNode.getChildren()[0].getChildren().length);
+    assertEquals(1, rootNode.getChildren()[1].getChildren().length);
+  }
+  
+  @Test
+  public void testSinglePath() {
+    Trie trie = new Trie();
+    assertTrue(trie.getRoot().hasSinglePath());
+    trie.add("abcdef");
+    assertTrue(trie.getRoot().hasSinglePath());
+    trie.add("abdfg");
+    Node rootNode = trie.getRoot();
+    assertEquals(2, rootNode.getChildren()[0].getChildren()[0].getChildren().length);
+    assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[0].hasSinglePath());
+    assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[1].hasSinglePath());
+  }
 }