You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC

svn commit: r1226637 [2/3] - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Tue Jan  3 04:33:56 2012
@@ -33,285 +33,285 @@ import java.nio.channels.ReadableByteCha
 import org.apache.lucene.analysis.kuromoji.trie.Trie.Node;
 
 public class DoubleArrayTrie {
-	
-	public static final String FILENAME = "dat.dat";
-
-	public static final char TERMINATING_CHARACTER = '\u0001';
-
-	private static final int BASE_CHECK_INITILAL_SIZE = 1000000;
-
-	private static final int TAIL_INITIAL_SIZE = 10000;
-	
-	private static final int TAIL_OFFSET = 10000000;
-	
-	private IntBuffer baseBuffer;
-	
-	private IntBuffer checkBuffer;
-	
-	private CharBuffer tailBuffer;
-	
-	private int tailIndex = TAIL_OFFSET;
-	
-
-	public DoubleArrayTrie(){
-	}
-
-	/**
-	 * Write to file
-	 * @param filename filename
-	 * @throws IOException
-	 */
-	public void write(String directoryname) throws IOException  {
-		String filename = directoryname + File.separator + FILENAME;
-
-		baseBuffer.rewind();
-		checkBuffer.rewind();
-		tailBuffer.rewind();
-		
-		File file = new File(filename);
-		if(file.exists()){
-			file.delete();
-		}
-		
-		RandomAccessFile raf = new RandomAccessFile(filename, "rw");
-		FileChannel channel = raf.getChannel();
-		raf.writeInt(baseBuffer.capacity());
-		raf.writeInt(tailBuffer.capacity());		
-
-		ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
-		IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
-		tmpIntBuffer.put(baseBuffer);
-		tmpBuffer.rewind();
-		channel.write(tmpBuffer);
-		
-		tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
-		tmpIntBuffer = tmpBuffer.asIntBuffer();
-		tmpIntBuffer.put(checkBuffer);
-		tmpBuffer.rewind();
-		channel.write(tmpBuffer);
-		
-		tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
-		CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
-		tmpCharBuffer.put(tailBuffer);
-		tmpBuffer.rewind();
-		channel.write(tmpBuffer);
-		
-		raf.close();
-	}
-	
-	public static DoubleArrayTrie getInstance() throws IOException {
-		InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
-		return read(is);
-	}
-	
-	/**
-	 * Load Stored data
-	 * @param is
-	 * @return
-	 * @throws IOException
-	 */
-	public static DoubleArrayTrie read(InputStream is) throws IOException {
-		DoubleArrayTrie trie = new DoubleArrayTrie();
-		DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
-		int baseCheckSize = dis.readInt();	// Read size of baseArr and checkArr
-		int tailSize = dis.readInt();		// Read size of tailArr
-		ReadableByteChannel channel = Channels.newChannel(dis);
-
-
-		ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);	// The size is 4 times the baseCheckSize since it is the length of array
-		channel.read(tmpBaseBuffer);
-		tmpBaseBuffer.rewind();
-		trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
-
-		ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
-		channel.read(tmpCheckBuffer);
-		tmpCheckBuffer.rewind();
-		trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
-
-		ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2);			// The size is 2 times the tailSize since it is the length of array
-		channel.read(tmpTailBuffer);
-		tmpTailBuffer.rewind();
-		trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
-
-		is.close();
-		return trie;
-	}
-	
-	/**
-	 * Construct double array trie which is equivalent to input trie
-	 * @param trie normal trie which contains all dictionary words
-	 */
-	public void build(Trie trie) {
-		baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
-		baseBuffer.put(0, 1);
-		checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
-		tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
-		add(-1, 0, trie.getRoot());
-	}
-	
-	/**
-	 * Add Node(character) to double array trie
-	 * @param previous
-	 * @param index
-	 * @param node
-	 */
-	private void add(int previous, int index, Node node) {
-		Node[] children = node.getChildren();	// nodes following current node
-
-		if(node.getChildren().length > 0 && node.hasSinglePath() && node.getChildren()[0].getKey() != TERMINATING_CHARACTER) {	// If node has only one path, put the rest in tail array
-			baseBuffer.put(index, tailIndex);	// current index of tail array
-			addToTail(node.children[0]);
-			checkBuffer.put(index, previous);
-			return;	// No more child to process
-		}
-
-		int base = findBase(index, children);	// Get base value for current index
-		baseBuffer.put(index, base);
-		
-		if(previous >= 0){
-			checkBuffer.put(index, previous);	// Set check value
-		}
-				
-		for(Trie.Node child : children) {	// For each child to double array trie
-			add(index, index + base + child.getKey(), child);
-		}
-		
-	}
-	
-	/**
-	 * Match input keyword.
-	 * @param key key to match
-	 * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
-	 */
-	public int lookup(String key) {
-		int index = 0;
-		int base = 1; // base at index 0 should be 1
-
-		int keyLength = key.length();
-		for(int i = 0; i < keyLength; i++) {
-			int previous = index;
-			index = index + base + key.charAt(i);
-			
-			if(index > baseBuffer.limit()) { // Too long
-				return -1;
-			}
-			
-			base = baseBuffer.get(index);
-			
-			if (base == 0 ) { // Didn't find match
-				return -1;
-			}
-
-			if(checkBuffer.get(index) != previous){	// check doesn't match
-				return -1;
-			}
-
-			if(base >= TAIL_OFFSET) {	// If base is bigger than TAIL_OFFSET, start processing "tail"
-				return matchTail(base, index, key.substring(i + 1));
-			}
-
-		}
-
-		// If we reach at the end of input keyword, check if it is complete match by looking for following terminating character		
-		int endIndex = index + base + TERMINATING_CHARACTER;
-		
-		return checkBuffer.get(endIndex) == index ? index : 0;
-	}
-
-	/**
-	 * Check match in tail array
-	 * @param base
-	 * @param index
-	 * @param key
-	 * @return	index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
-	 */
-	private int matchTail(int base, int index, String key) {
-		int positionInTailArr = base - TAIL_OFFSET;
-		
-		int keyLength = key.length();
-		for(int i = 0; i < keyLength; i++) {
-			if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
-				return -1;
-			}
-		}
-		return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
-		
-	}
-
-	/**
-	 * Find base value for current node, which contains input nodes. They are children of current node.
-	 * Set default base value , which is one, at the index of each input node.
-	 * @param index
-	 * @param nodes
-	 * @return	base value for current node
-	 */
-	private int findBase(int index, Node[] nodes){
-		int base = baseBuffer.get(index);
-		if(base < 0) {
-			return base;
-		}
-		
-		while(true) {
-			boolean collision = false;	// already taken?
-			for(Node node : nodes) {
-				/*
-				 * NOTE:
-				 * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
-				 * However, this makes array bigger. If there is a need to compat the file dat.dat, it's possbile to modify here and there.
-				 * Although the size of jar file doesn't change, memory consumption will be smaller.
-				 */
-				int nextIndex = index + base + node.getKey();
-				
-				if(baseBuffer.capacity() <= nextIndex) {
-					int newLength = nextIndex + 1;
-					IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
-					baseBuffer.rewind();
-					newBaseBuffer.put(baseBuffer);
-					baseBuffer = newBaseBuffer;
-					IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
-					checkBuffer.rewind();
-					newCheckBuffer.put(checkBuffer);
-					checkBuffer = newCheckBuffer;
-				}
-				
-				if(baseBuffer.get(nextIndex) != 0) {	// already taken
-					base++;	// check next base value
-					collision = true;
-					break;
-				}
-			}
-			
-			if(!collision){
-				break;	// if there is no collision, found proper base value. Break the while loop.
-			}
-			
-		}
-
-		for(Node node : nodes) {
-			baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1);	// Set -1 if key is terminating character. Set default base value 1 if not.
-		}
-
-		return base;
-	}
-	
-	/**
-	 * Add characters(nodes) to tail array
-	 * @param node
-	 */
-	private void addToTail(Node node) {
-		while(true) {
-			if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
-				CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
-				tailBuffer.rewind();
-				newTailBuffer.put(tailBuffer);
-				tailBuffer = newTailBuffer;
-			}
-			tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node
-
-			if(node.getChildren().length == 0) {	// if it reached the end of input, break.
-				break;
-			}
-			node = node.getChildren()[0];	// Move to next node
-		}
-	}
+  
+  public static final String FILENAME = "dat.dat";
+  
+  public static final char TERMINATING_CHARACTER = '\u0001';
+  
+  private static final int BASE_CHECK_INITILAL_SIZE = 1000000;
+  
+  private static final int TAIL_INITIAL_SIZE = 10000;
+  
+  private static final int TAIL_OFFSET = 10000000;
+  
+  private IntBuffer baseBuffer;
+  
+  private IntBuffer checkBuffer;
+  
+  private CharBuffer tailBuffer;
+  
+  private int tailIndex = TAIL_OFFSET;
+  
+  
+  public DoubleArrayTrie(){
+  }
+  
+  /**
+   * Write to file
+   * @param filename filename
+   * @throws IOException
+   */
+  public void write(String directoryname) throws IOException  {
+    String filename = directoryname + File.separator + FILENAME;
+    
+    baseBuffer.rewind();
+    checkBuffer.rewind();
+    tailBuffer.rewind();
+    
+    File file = new File(filename);
+    if(file.exists()){
+      file.delete();
+    }
+    
+    RandomAccessFile raf = new RandomAccessFile(filename, "rw");
+    FileChannel channel = raf.getChannel();
+    raf.writeInt(baseBuffer.capacity());
+    raf.writeInt(tailBuffer.capacity());		
+    
+    ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
+    IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
+    tmpIntBuffer.put(baseBuffer);
+    tmpBuffer.rewind();
+    channel.write(tmpBuffer);
+    
+    tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
+    tmpIntBuffer = tmpBuffer.asIntBuffer();
+    tmpIntBuffer.put(checkBuffer);
+    tmpBuffer.rewind();
+    channel.write(tmpBuffer);
+    
+    tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
+    CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
+    tmpCharBuffer.put(tailBuffer);
+    tmpBuffer.rewind();
+    channel.write(tmpBuffer);
+    
+    raf.close();
+  }
+  
+  public static DoubleArrayTrie getInstance() throws IOException {
+    InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
+    return read(is);
+  }
+  
+  /**
+   * Load Stored data
+   * @param is
+   * @return
+   * @throws IOException
+   */
+  public static DoubleArrayTrie read(InputStream is) throws IOException {
+    DoubleArrayTrie trie = new DoubleArrayTrie();
+    DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
+    int baseCheckSize = dis.readInt();	// Read size of baseArr and checkArr
+    int tailSize = dis.readInt();		// Read size of tailArr
+    ReadableByteChannel channel = Channels.newChannel(dis);
+    
+    
+    ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);	// The size is 4 times the baseCheckSize since it is the length of array
+    channel.read(tmpBaseBuffer);
+    tmpBaseBuffer.rewind();
+    trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
+    
+    ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
+    channel.read(tmpCheckBuffer);
+    tmpCheckBuffer.rewind();
+    trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
+    
+    ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2);			// The size is 2 times the tailSize since it is the length of array
+    channel.read(tmpTailBuffer);
+    tmpTailBuffer.rewind();
+    trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
+    
+    is.close();
+    return trie;
+  }
+  
+  /**
+   * Construct double array trie which is equivalent to input trie
+   * @param trie normal trie which contains all dictionary words
+   */
+  public void build(Trie trie) {
+    baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+    baseBuffer.put(0, 1);
+    checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+    tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
+    add(-1, 0, trie.getRoot());
+  }
+  
+  /**
+   * Add Node(character) to double array trie
+   * @param previous
+   * @param index
+   * @param node
+   */
+  private void add(int previous, int index, Node node) {
+    Node[] children = node.getChildren();	// nodes following current node
+    
+    if(node.getChildren().length > 0 && node.hasSinglePath() && node.getChildren()[0].getKey() != TERMINATING_CHARACTER) {	// If node has only one path, put the rest in tail array
+      baseBuffer.put(index, tailIndex);	// current index of tail array
+      addToTail(node.children[0]);
+      checkBuffer.put(index, previous);
+      return;	// No more child to process
+    }
+    
+    int base = findBase(index, children);	// Get base value for current index
+    baseBuffer.put(index, base);
+    
+    if(previous >= 0){
+      checkBuffer.put(index, previous);	// Set check value
+    }
+    
+    for(Trie.Node child : children) {	// For each child to double array trie
+      add(index, index + base + child.getKey(), child);
+    }
+    
+  }
+  
+  /**
+   * Match input keyword.
+   * @param key key to match
+   * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
+   */
+  public int lookup(String key) {
+    int index = 0;
+    int base = 1; // base at index 0 should be 1
+    
+    int keyLength = key.length();
+    for(int i = 0; i < keyLength; i++) {
+      int previous = index;
+      index = index + base + key.charAt(i);
+      
+      if(index > baseBuffer.limit()) { // Too long
+        return -1;
+      }
+      
+      base = baseBuffer.get(index);
+      
+      if (base == 0 ) { // Didn't find match
+        return -1;
+      }
+      
+      if(checkBuffer.get(index) != previous){	// check doesn't match
+        return -1;
+      }
+      
+      if(base >= TAIL_OFFSET) {	// If base is bigger than TAIL_OFFSET, start processing "tail"
+        return matchTail(base, index, key.substring(i + 1));
+      }
+      
+    }
+    
+    // If we reach at the end of input keyword, check if it is complete match by looking for following terminating character		
+    int endIndex = index + base + TERMINATING_CHARACTER;
+    
+    return checkBuffer.get(endIndex) == index ? index : 0;
+  }
+  
+  /**
+   * Check match in tail array
+   * @param base
+   * @param index
+   * @param key
+   * @return	index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
+   */
+  private int matchTail(int base, int index, String key) {
+    int positionInTailArr = base - TAIL_OFFSET;
+    
+    int keyLength = key.length();
+    for(int i = 0; i < keyLength; i++) {
+      if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
+        return -1;
+      }
+    }
+    return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
+    
+  }
+  
+  /**
+   * Find base value for current node, which contains input nodes. They are children of current node.
+   * Set default base value , which is one, at the index of each input node.
+   * @param index
+   * @param nodes
+   * @return	base value for current node
+   */
+  private int findBase(int index, Node[] nodes){
+    int base = baseBuffer.get(index);
+    if(base < 0) {
+      return base;
+    }
+    
+    while(true) {
+      boolean collision = false;	// already taken?
+      for(Node node : nodes) {
+        /*
+         * NOTE:
+         * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
+         * However, this makes array bigger. If there is a need to compat the file dat.dat, it's possbile to modify here and there.
+         * Although the size of jar file doesn't change, memory consumption will be smaller.
+         */
+        int nextIndex = index + base + node.getKey();
+        
+        if(baseBuffer.capacity() <= nextIndex) {
+          int newLength = nextIndex + 1;
+          IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+          baseBuffer.rewind();
+          newBaseBuffer.put(baseBuffer);
+          baseBuffer = newBaseBuffer;
+          IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+          checkBuffer.rewind();
+          newCheckBuffer.put(checkBuffer);
+          checkBuffer = newCheckBuffer;
+        }
+        
+        if(baseBuffer.get(nextIndex) != 0) {	// already taken
+          base++;	// check next base value
+          collision = true;
+          break;
+        }
+      }
+      
+      if(!collision){
+        break;	// if there is no collision, found proper base value. Break the while loop.
+      }
+      
+    }
+    
+    for(Node node : nodes) {
+      baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1);	// Set -1 if key is terminating character. Set default base value 1 if not.
+    }
+    
+    return base;
+  }
+  
+  /**
+   * Add characters(nodes) to tail array
+   * @param node
+   */
+  private void addToTail(Node node) {
+    while(true) {
+      if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
+        CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
+        tailBuffer.rewind();
+        newTailBuffer.put(tailBuffer);
+        tailBuffer = newTailBuffer;
+      }
+      tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node
+      
+      if(node.getChildren().length == 0) {	// if it reached the end of input, break.
+        break;
+      }
+      node = node.getChildren()[0];	// Move to next node
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java Tue Jan  3 04:33:56 2012
@@ -18,131 +18,131 @@ package org.apache.lucene.analysis.kurom
  */
 
 public class Trie {
-	
-	private Node root;	// Root node of Trie
-
-	/**
-	 * Constructor
-	 * Initialize Trie with empty root node
-	 */
-	public Trie() {
-		root = new Node();
-	}
-
-	/**
-	 * Add input value into Trie
-	 * Before adding, it adds terminating character(\u0001) to input string
-	 * @param value String to add to Trie
-	 */
-	public void add(String value) {
-		root.add(value + DoubleArrayTrie.TERMINATING_CHARACTER);
-	}
-
-	/**
-	 * Return root node which contains other nodes
-	 * @return	Node
-	 */
-	public Node getRoot() {
-		return root;
-	}
-
-	/**
-	 * Trie Node
-	 */
-	public class Node {
-		char key;						// key(char) of this node
-		
-		Node[] children = new Node[0];	// Array to hold children nodes
-
-		/**
-		 * Constructor
-		 */
-		public Node() {
-		}
-
-		/**
-		 * Constructor
-		 * @param key key for this node
-		 */
-		public Node(char key) {
-			this.key = key;
-		}
-
-		/**
-		 * Add string to Trie
-		 * @param value String to add
-		 */
-		public void add(String value) {
-			if (value.length() == 0) {
-				return;
-			}
-			
-			Node node = new Node(value.charAt(0));
-			addChild(node).add(value.substring(1));
-		}
-
-		/**
-		 * Add Node to this node as child
-		 * @param newNode node to add
-		 * @return added node. If a node with same key already exists, return that node.
-		 */
-		public Node addChild(Node newNode) {
-			Node child = getChild(newNode.getKey());
-			if (child == null) {
-				Node[] newChildren = new Node[children.length + 1];
-				System.arraycopy(children, 0, newChildren, 0, children.length);
-				newChildren[newChildren.length -1] = newNode;
-				children = newChildren;
-				child = newNode;
-			}
-			return child;
-		}
-
-		/**
-		 * Return the key of the node
-		 * @return key
-		 */
-		public char getKey() {
-			return key;
-		}
-		
-		/**
-		 * Check if children following this node has only single path.
-		 * For example, if you have "abcde" and "abfgh" in Trie, calling this method on node "a" and "b" returns false.
-		 * Calling this method on "c", "d", "e", "f", "g" and "h" returns true.
-		 * @return true if it has only single path. false if it has multiple path.
-		 */
-		public boolean hasSinglePath() {
-			switch(children.length){
-			case 0:
-				return true;
-			case 1:
-				return children[0].hasSinglePath();
-			default:
-				return false;
-			}
-		}
-
-		/**
-		 * Return children node
-		 * @return Array of children nodes
-		 */
-		public Node[] getChildren() {
-			return children;
-		}
-
-		/**
-		 * Return node which has input key
-		 * @param key key to look for
-		 * @return node which has input key. null if it doesn't exist.
-		 */
-		private Node getChild(char key) {
-			for (Node child : children) {
-				if (child.getKey() == key) {
-					return child;
-				}
-			}
-			return null;
-		}
-	}
+  
+  private Node root;	// Root node of Trie
+  
+  /**
+   * Constructor
+   * Initialize Trie with empty root node
+   */
+  public Trie() {
+    root = new Node();
+  }
+  
+  /**
+   * Add input value into Trie
+   * Before adding, it adds terminating character(\u0001) to input string
+   * @param value String to add to Trie
+   */
+  public void add(String value) {
+    root.add(value + DoubleArrayTrie.TERMINATING_CHARACTER);
+  }
+  
+  /**
+   * Return root node which contains other nodes
+   * @return	Node
+   */
+  public Node getRoot() {
+    return root;
+  }
+  
+  /**
+   * Trie Node
+   */
+  public class Node {
+    char key;						// key(char) of this node
+    
+    Node[] children = new Node[0];	// Array to hold children nodes
+    
+    /**
+     * Constructor
+     */
+    public Node() {
+    }
+    
+    /**
+     * Constructor
+     * @param key key for this node
+     */
+    public Node(char key) {
+      this.key = key;
+    }
+    
+    /**
+     * Add string to Trie
+     * @param value String to add
+     */
+    public void add(String value) {
+      if (value.length() == 0) {
+        return;
+      }
+      
+      Node node = new Node(value.charAt(0));
+      addChild(node).add(value.substring(1));
+    }
+    
+    /**
+     * Add Node to this node as child
+     * @param newNode node to add
+     * @return added node. If a node with same key already exists, return that node.
+     */
+    public Node addChild(Node newNode) {
+      Node child = getChild(newNode.getKey());
+      if (child == null) {
+        Node[] newChildren = new Node[children.length + 1];
+        System.arraycopy(children, 0, newChildren, 0, children.length);
+        newChildren[newChildren.length -1] = newNode;
+        children = newChildren;
+        child = newNode;
+      }
+      return child;
+    }
+    
+    /**
+     * Return the key of the node
+     * @return key
+     */
+    public char getKey() {
+      return key;
+    }
+    
+    /**
+     * Check if children following this node has only single path.
+     * For example, if you have "abcde" and "abfgh" in Trie, calling this method on node "a" and "b" returns false.
+     * Calling this method on "c", "d", "e", "f", "g" and "h" returns true.
+     * @return true if it has only single path. false if it has multiple path.
+     */
+    public boolean hasSinglePath() {
+      switch(children.length){
+        case 0:
+          return true;
+        case 1:
+          return children[0].hasSinglePath();
+        default:
+          return false;
+      }
+    }
+    
+    /**
+     * Return children node
+     * @return Array of children nodes
+     */
+    public Node[] getChildren() {
+      return children;
+    }
+    
+    /**
+     * Return node which has input key
+     * @param key key to look for
+     * @return node which has input key. null if it doesn't exist.
+     */
+    private Node getChild(char key) {
+      for (Node child : children) {
+        if (child.getKey() == key) {
+          return child;
+        }
+      }
+      return null;
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java Tue Jan  3 04:33:56 2012
@@ -22,80 +22,80 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 public class CSVUtil {
-	private static final char QUOTE = '"';
-	
-	private static final char COMMA = ',';
-
-	private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
-
-	private static final String ESCAPED_QUOTE = "\"\"";
-	
-	/**
-	 * Parse CSV line
-	 * @param line
-	 * @return Array of values
-	 */
-	public static String[] parse(String line) {
-		boolean insideQuote = false;
-		ArrayList<String> result = new ArrayList<String>();		
-		int quoteCount = 0;
-		StringBuilder sb = new StringBuilder();
-		for(int i = 0; i < line.length(); i++) {
-			char c = line.charAt(i);
-
-			if(c == QUOTE) {
-				insideQuote = !insideQuote;
-				quoteCount++;
-			}
-			
-			if(c == COMMA && !insideQuote) {
-				String value = sb.toString();
-				value = unQuoteUnEscape(value);
-				result.add(value);
-				sb = new StringBuilder();
-				continue;
-			}
-			
-			sb.append(c);
-		}
-		
-		result.add(sb.toString());
-
-		// Validate
-		if(quoteCount % 2 != 0) {
-			return new String[0];
-		}
-		
-		return result.toArray(new String[result.size()]);
-	}
-	
-	private static String unQuoteUnEscape(String original) {
-		String result = original;
-		
-		// Unquote
-		Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
-		if(m.matches()) {
-			result = m.group(1);
-		}
-		
-		// Unescape
-		result = result.replaceAll(ESCAPED_QUOTE, "\"");
-		
-		return result;
-		
-	}
-	
-	/**
-	 * Quote and escape input value for CSV
-	 * @param original
-	 * @return
-	 */
-	public static String quoteEscape(String original) {
-		String result = original.replaceAll("\"", ESCAPED_QUOTE);
-		if(result.indexOf(COMMA) >= 0) {
-			result = "\"" + result + "\"";
-		}
-		return result;
-	}
-
+  private static final char QUOTE = '"';
+  
+  private static final char COMMA = ',';
+  
+  private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
+  
+  private static final String ESCAPED_QUOTE = "\"\"";
+  
+  /**
+   * Parse CSV line
+   * @param line
+   * @return Array of values
+   */
+  public static String[] parse(String line) {
+    boolean insideQuote = false;
+    ArrayList<String> result = new ArrayList<String>();		
+    int quoteCount = 0;
+    StringBuilder sb = new StringBuilder();
+    for(int i = 0; i < line.length(); i++) {
+      char c = line.charAt(i);
+      
+      if(c == QUOTE) {
+        insideQuote = !insideQuote;
+        quoteCount++;
+      }
+      
+      if(c == COMMA && !insideQuote) {
+        String value = sb.toString();
+        value = unQuoteUnEscape(value);
+        result.add(value);
+        sb = new StringBuilder();
+        continue;
+      }
+      
+      sb.append(c);
+    }
+    
+    result.add(sb.toString());
+    
+    // Validate
+    if(quoteCount % 2 != 0) {
+      return new String[0];
+    }
+    
+    return result.toArray(new String[result.size()]);
+  }
+  
+  private static String unQuoteUnEscape(String original) {
+    String result = original;
+    
+    // Unquote
+    Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
+    if(m.matches()) {
+      result = m.group(1);
+    }
+    
+    // Unescape
+    result = result.replaceAll(ESCAPED_QUOTE, "\"");
+    
+    return result;
+    
+  }
+  
+  /**
+   * Quote and escape input value for CSV
+   * @param original
+   * @return
+   */
+  public static String quoteEscape(String original) {
+    String result = original.replaceAll("\"", ESCAPED_QUOTE);
+    if(result.indexOf(COMMA) >= 0) {
+      result = "\"" + result + "\"";
+    }
+    return result;
+  }
+  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java Tue Jan  3 04:33:56 2012
@@ -25,39 +25,39 @@ import java.io.LineNumberReader;
 import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
 
 public class ConnectionCostsBuilder {
-	
-	public ConnectionCostsBuilder() {
-		
-	}
-	
-	public static ConnectionCosts build(String filename) throws IOException {
-		FileInputStream inputStream = new FileInputStream(filename);
-		InputStreamReader streamReader = new InputStreamReader(inputStream);
-		LineNumberReader lineReader = new LineNumberReader(streamReader);
-
-		String line = lineReader.readLine();
-		String[] dimensions = line.split("\\s+");
-		
-		assert dimensions.length == 3;
-
-		int forwardSize = Integer.parseInt(dimensions[0]);
-		int backwardSize = Integer.parseInt(dimensions[1]);
-		
-		assert forwardSize > 0 && backwardSize > 0;
-		
-		ConnectionCosts costs = new ConnectionCosts(forwardSize, backwardSize);
-		
-		while ((line = lineReader.readLine()) != null) {
-			String[] fields = line.split("\\s+");
-
-			assert fields.length == 3;
-			
-			int forwardId = Integer.parseInt(fields[0]);
-			int backwardId = Integer.parseInt(fields[1]);
-			int cost = Integer.parseInt(fields[2]);
-
-			costs.add(forwardId, backwardId, cost);
-		}
-		return costs;
-	}
+  
+  public ConnectionCostsBuilder() {
+    
+  }
+  
+  public static ConnectionCosts build(String filename) throws IOException {
+    FileInputStream inputStream = new FileInputStream(filename);
+    InputStreamReader streamReader = new InputStreamReader(inputStream);
+    LineNumberReader lineReader = new LineNumberReader(streamReader);
+    
+    String line = lineReader.readLine();
+    String[] dimensions = line.split("\\s+");
+    
+    assert dimensions.length == 3;
+    
+    int forwardSize = Integer.parseInt(dimensions[0]);
+    int backwardSize = Integer.parseInt(dimensions[1]);
+    
+    assert forwardSize > 0 && backwardSize > 0;
+    
+    ConnectionCosts costs = new ConnectionCosts(forwardSize, backwardSize);
+    
+    while ((line = lineReader.readLine()) != null) {
+      String[] fields = line.split("\\s+");
+      
+      assert fields.length == 3;
+      
+      int forwardId = Integer.parseInt(fields[0]);
+      int backwardId = Integer.parseInt(fields[1]);
+      int cost = Integer.parseInt(fields[2]);
+      
+      costs.add(forwardId, backwardId, cost);
+    }
+    return costs;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Tue Jan  3 04:33:56 2012
@@ -27,81 +27,81 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
 
 public class DictionaryBuilder {
-	
-	public enum DictionaryFormat { IPADIC, UNIDIC };
-	
-	public DictionaryBuilder() {
-		
-	}
-	
-	public void build(DictionaryFormat format,
-					  String inputDirname,
-					  String outputDirname,
-					  String encoding,
-					  boolean normalizeEntry) throws IOException {
-		System.out.println("building tokeninfo dict...");
-		TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
-		TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
-
-		System.out.print("  building double array trie...");
-		DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
-		trie.write(outputDirname);
-		System.out.println("  done");
-
-		System.out.print("  processing target map...");
-		for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
-			int tokenInfoId = entry.getKey();
-			String surfaceform = entry.getValue();
-			int doubleArrayId = trie.lookup(surfaceform);
-			assert doubleArrayId > 0;
-			tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
-		}		
-		tokenInfoDictionary.write(outputDirname);
-		trie = null;
-		tokenInfoBuilder = null;
-		
-		System.out.println("  done");
-		System.out.println("done");
-
-		System.out.print("building unknown word dict...");
-		UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
-		UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
-		unkDictionary.write(outputDirname);
-		System.out.println("done");
-
-		System.out.print("building connection costs...");
-		ConnectionCosts connectionCosts
-			= ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
-		connectionCosts.write(outputDirname);
-		System.out.println("done");
-	}
-	
-	public static void main(String[] args) throws IOException, ClassNotFoundException {
-		DictionaryFormat format;
-		if (args[0].equalsIgnoreCase("ipadic")) {
-			format = DictionaryFormat.IPADIC;
-		} else if (args[0].equalsIgnoreCase("unidic")) {
-			format = DictionaryFormat.UNIDIC;
-		} else {
-			System.err.println("Illegal format " + args[0] + " using unidic instead");
-			format = DictionaryFormat.IPADIC;
-		}
-
-		String inputDirname = args[1];
-		String outputDirname = args[2];
-		String inputEncoding = args[3];
-		boolean normalizeEntries = Boolean.parseBoolean(args[4]);
-		
-		DictionaryBuilder builder = new DictionaryBuilder();
-		System.out.println("dictionary builder");
-		System.out.println("");
-		System.out.println("dictionary format: " + format);
-		System.out.println("input directory: " + inputDirname);
-		System.out.println("output directory: " + outputDirname);
-		System.out.println("input encoding: " + inputEncoding);
-		System.out.println("normalize entries: " + normalizeEntries);
-		System.out.println("");
-		builder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
-	}
-	
+  
+  public enum DictionaryFormat { IPADIC, UNIDIC };
+  
+  public DictionaryBuilder() {
+    
+  }
+  
+  public void build(DictionaryFormat format,
+      String inputDirname,
+      String outputDirname,
+      String encoding,
+      boolean normalizeEntry) throws IOException {
+    System.out.println("building tokeninfo dict...");
+    TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
+    TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
+    
+    System.out.print("  building double array trie...");
+    DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
+    trie.write(outputDirname);
+    System.out.println("  done");
+    
+    System.out.print("  processing target map...");
+    for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
+      int tokenInfoId = entry.getKey();
+      String surfaceform = entry.getValue();
+      int doubleArrayId = trie.lookup(surfaceform);
+      assert doubleArrayId > 0;
+      tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
+    }		
+    tokenInfoDictionary.write(outputDirname);
+    trie = null;
+    tokenInfoBuilder = null;
+    
+    System.out.println("  done");
+    System.out.println("done");
+    
+    System.out.print("building unknown word dict...");
+    UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
+    UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
+    unkDictionary.write(outputDirname);
+    System.out.println("done");
+    
+    System.out.print("building connection costs...");
+    ConnectionCosts connectionCosts
+    = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
+    connectionCosts.write(outputDirname);
+    System.out.println("done");
+  }
+  
+  public static void main(String[] args) throws IOException, ClassNotFoundException {
+    DictionaryFormat format;
+    if (args[0].equalsIgnoreCase("ipadic")) {
+      format = DictionaryFormat.IPADIC;
+    } else if (args[0].equalsIgnoreCase("unidic")) {
+      format = DictionaryFormat.UNIDIC;
+    } else {
+      System.err.println("Illegal format " + args[0] + " using unidic instead");
+      format = DictionaryFormat.IPADIC;
+    }
+    
+    String inputDirname = args[1];
+    String outputDirname = args[2];
+    String inputEncoding = args[3];
+    boolean normalizeEntries = Boolean.parseBoolean(args[4]);
+    
+    DictionaryBuilder builder = new DictionaryBuilder();
+    System.out.println("dictionary builder");
+    System.out.println("");
+    System.out.println("dictionary format: " + format);
+    System.out.println("input directory: " + inputDirname);
+    System.out.println("output directory: " + outputDirname);
+    System.out.println("input encoding: " + inputEncoding);
+    System.out.println("normalize entries: " + normalizeEntries);
+    System.out.println("");
+    builder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
+  }
+  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java Tue Jan  3 04:33:56 2012
@@ -24,25 +24,25 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.trie.Trie;
 
 public class DoubleArrayTrieBuilder {
-
-	
-	public DoubleArrayTrieBuilder() {
-		
-	}
-
-	public static DoubleArrayTrie build(Set<Entry<Integer, String>> entries) {
-		Trie tempTrie = buildTrie(entries);
-		DoubleArrayTrie daTrie = new DoubleArrayTrie();
-		daTrie.build(tempTrie);
-		return daTrie;
-	}
-	
-	public static Trie buildTrie(Set<Entry<Integer, String>> entries) {
-		Trie trie = new Trie();
-		for (Entry<Integer, String> entry : entries) {
-			String surfaceForm = entry.getValue();
-			trie.add(surfaceForm);
-		}
-		return trie;
-	}
+  
+  
+  public DoubleArrayTrieBuilder() {
+    
+  }
+  
+  public static DoubleArrayTrie build(Set<Entry<Integer, String>> entries) {
+    Trie tempTrie = buildTrie(entries);
+    DoubleArrayTrie daTrie = new DoubleArrayTrie();
+    daTrie.build(tempTrie);
+    return daTrie;
+  }
+  
+  public static Trie buildTrie(Set<Entry<Integer, String>> entries) {
+    Trie trie = new Trie();
+    for (Entry<Integer, String> entry : entries) {
+      String surfaceForm = entry.getValue();
+      trie.add(surfaceForm);
+    }
+    return trie;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Tue Jan  3 04:33:56 2012
@@ -39,142 +39,142 @@ import org.apache.lucene.analysis.kuromo
  * @author Christian Moen
  */
 public class TokenInfoDictionaryBuilder {
-		
-	/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
-	private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
-
-	private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
-
-	private String encoding = "euc-jp";
-	
-	private boolean normalizeEntries = false;
-	
-	private DictionaryFormat format = DictionaryFormat.IPADIC;
-	
-	public TokenInfoDictionaryBuilder() {
-	}
-	
-	public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
-		this.format = format;
-		this.encoding = encoding;
-		this.dictionaryEntries = new TreeMap<Integer, String>();		
-		this.normalizeEntries = normalizeEntries;
-	}
-
-	public TokenInfoDictionary build(String dirname) throws IOException {
-		FilenameFilter filter = new FilenameFilter() {
-			@Override
-			public boolean accept(File dir, String name) {
-				return name.endsWith(".csv");
-			}
-		};
-		ArrayList<File> csvFiles = new ArrayList<File>();
-		for (File file : new File(dirname).listFiles(filter)) {
-			csvFiles.add(file);
-		}
-		return buildDictionary(csvFiles);
-	}
-	
-	public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
-		TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
-		
-		for (File file : csvFiles){
-			FileInputStream inputStream = new FileInputStream(file);
-			InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
-			BufferedReader reader = new BufferedReader(streamReader);
-
-			String line = null;
-			while ((line = reader.readLine()) != null) {
-				String[] entry = CSVUtil.parse(line);
-				if(entry.length < 13) {
-					System.out.println("Entry in CSV is not valid: " + line);
-					continue;
-				}
-				int next = dictionary.put(formatEntry(entry));
-
-				if(next == offset){
-					System.out.println("Failed to process line: " + line);
-					continue;
-				}
-				
-				dictionaryEntries.put(offset, entry[0]);
-				offset = next;
-				
-				// NFKC normalize dictionary entry
-				if (normalizeEntries) {
-					if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
-						continue;
-					}
-					String[] normalizedEntry = new String[entry.length];
-					for (int i = 0; i < entry.length; i++) {
-						normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
-					}
-					
-					next = dictionary.put(formatEntry(normalizedEntry));
-					dictionaryEntries.put(offset, normalizedEntry[0]);
-					offset = next;
-				}
-			}
-		}
-		
-		return dictionary;
-	}
-
-	/*
-	 * IPADIC features
-	 * 
-	 * 0	- surface
-	 * 1	- left cost
-	 * 2	- right cost
-	 * 3	- word cost
-	 * 4-9	- pos
-	 * 10	- base form
-	 * 11	- reading
-	 * 12	- pronounciation
-	 *
-	 * UniDic features
-	 * 
-	 * 0	- surface
-	 * 1	- left cost
-	 * 2	- right cost
-	 * 3	- word cost
-	 * 4-9	- pos
-	 * 10	- base form reading
-	 * 11	- base form
-	 * 12	- surface form
-	 * 13	- surface reading
-	 */
-	public String[] formatEntry(String[] features) {
-		if (this.format == DictionaryFormat.IPADIC) {
-			return features;
-		} else {
-			String[] features2 = new String[13];
-			features2[0] = features[0];
-			features2[1] = features[1];
-			features2[2] = features[2];
-			features2[3] = features[3];
-			features2[4] = features[4];
-			features2[5] = features[5];
-			features2[6] = features[6];
-			features2[7] = features[7];
-			features2[8] = features[8];
-			features2[9] = features[9];
-			features2[10] = features[11];
-			
-			// If the surface reading is non-existent, use surface form for reading and pronunciation.
-			// This happens with punctuation in UniDic and there are possibly other cases as well
-			if (features[13].length() == 0) {
-				features2[11] = features[0];
-				features2[12] = features[0];
-			} else {
-				features2[11] = features[13];
-				features2[12] = features[13];
-			}			
-			return features2;
-		}
-	}
-	
-	public Set<Entry<Integer, String>> entrySet() {
-		return dictionaryEntries.entrySet();
-	}	
+  
+  /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
+  private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
+  
+  private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
+  
+  private String encoding = "euc-jp";
+  
+  private boolean normalizeEntries = false;
+  
+  private DictionaryFormat format = DictionaryFormat.IPADIC;
+  
+  public TokenInfoDictionaryBuilder() {
+  }
+  
+  public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
+    this.format = format;
+    this.encoding = encoding;
+    this.dictionaryEntries = new TreeMap<Integer, String>();		
+    this.normalizeEntries = normalizeEntries;
+  }
+  
+  public TokenInfoDictionary build(String dirname) throws IOException {
+    FilenameFilter filter = new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(".csv");
+      }
+    };
+    ArrayList<File> csvFiles = new ArrayList<File>();
+    for (File file : new File(dirname).listFiles(filter)) {
+      csvFiles.add(file);
+    }
+    return buildDictionary(csvFiles);
+  }
+  
+  public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
+    TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
+    
+    for (File file : csvFiles){
+      FileInputStream inputStream = new FileInputStream(file);
+      InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+      BufferedReader reader = new BufferedReader(streamReader);
+      
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        String[] entry = CSVUtil.parse(line);
+        if(entry.length < 13) {
+          System.out.println("Entry in CSV is not valid: " + line);
+          continue;
+        }
+        int next = dictionary.put(formatEntry(entry));
+        
+        if(next == offset){
+          System.out.println("Failed to process line: " + line);
+          continue;
+        }
+        
+        dictionaryEntries.put(offset, entry[0]);
+        offset = next;
+        
+        // NFKC normalize dictionary entry
+        if (normalizeEntries) {
+          if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
+            continue;
+          }
+          String[] normalizedEntry = new String[entry.length];
+          for (int i = 0; i < entry.length; i++) {
+            normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
+          }
+          
+          next = dictionary.put(formatEntry(normalizedEntry));
+          dictionaryEntries.put(offset, normalizedEntry[0]);
+          offset = next;
+        }
+      }
+    }
+    
+    return dictionary;
+  }
+  
+  /*
+   * IPADIC features
+   * 
+   * 0	- surface
+   * 1	- left cost
+   * 2	- right cost
+   * 3	- word cost
+   * 4-9	- pos
+   * 10	- base form
+   * 11	- reading
+   * 12	- pronounciation
+   *
+   * UniDic features
+   * 
+   * 0	- surface
+   * 1	- left cost
+   * 2	- right cost
+   * 3	- word cost
+   * 4-9	- pos
+   * 10	- base form reading
+   * 11	- base form
+   * 12	- surface form
+   * 13	- surface reading
+   */
+  public String[] formatEntry(String[] features) {
+    if (this.format == DictionaryFormat.IPADIC) {
+      return features;
+    } else {
+      String[] features2 = new String[13];
+      features2[0] = features[0];
+      features2[1] = features[1];
+      features2[2] = features[2];
+      features2[3] = features[3];
+      features2[4] = features[4];
+      features2[5] = features[5];
+      features2[6] = features[6];
+      features2[7] = features[7];
+      features2[8] = features[8];
+      features2[9] = features[9];
+      features2[10] = features[11];
+      
+      // If the surface reading is non-existent, use surface form for reading and pronunciation.
+      // This happens with punctuation in UniDic and there are possibly other cases as well
+      if (features[13].length() == 0) {
+        features2[11] = features[0];
+        features2[12] = features[0];
+      } else {
+        features2[11] = features[13];
+        features2[12] = features[13];
+      }			
+      return features2;
+    }
+  }
+  
+  public Set<Entry<Integer, String>> entrySet() {
+    return dictionaryEntries.entrySet();
+  }	
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Tue Jan  3 04:33:56 2012
@@ -26,88 +26,88 @@ import java.io.LineNumberReader;
 import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
 
 public class UnknownDictionaryBuilder {
-	private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
-	
-	private String encoding = "euc-jp";
-	
-	public UnknownDictionaryBuilder() {
-		
-	}
-
-	public UnknownDictionaryBuilder(String encoding) {
-		this.encoding = encoding;
-	}
-	
-	public UnknownDictionary build(String dirname) throws IOException {
-		UnknownDictionary unkDictionary = null;
-		unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
-		readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
-		return unkDictionary;
-	}
-	
-	public UnknownDictionary readDictionaryFile(String filename)
-		throws IOException {
-		return readDictionaryFile(filename, encoding);
-	}
-
-	public UnknownDictionary readDictionaryFile(String filename, String encoding)
-		throws IOException {
-		UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
-		
-		FileInputStream inputStream = new FileInputStream(filename);
-		InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
-		LineNumberReader lineReader = new LineNumberReader(streamReader);
-		
-		dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
-
-		String line = null;
-		while ((line = lineReader.readLine()) != null) {
-			dictionary.put(CSVUtil.parse(line)); // Probably we don't need to validate entry
-		}
-
-		return dictionary;
-	}
-	
-	public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
-		FileInputStream inputStream = new FileInputStream(filename);
-		InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
-		LineNumberReader lineReader = new LineNumberReader(streamReader);
-
-		String line = null;
-		
-		while ((line = lineReader.readLine()) != null) {
-			line = line.replaceAll("^\\s", "");
-			line = line.replaceAll("\\s*#.*", "");
-			line = line.replaceAll("\\s+", " ");
-			
-			// Skip empty line or comment line
-			if(line.length() == 0) {
-				continue;
-			}
-			
-			if(line.startsWith("0x")) {	// Category mapping
-				String[] values = line.split(" ", 2);	// Split only first space
-				
-				if(!values[0].contains("..")) {
-					int cp = Integer.decode(values[0]).intValue();
-					dictionary.putCharacterCategory(cp, values[1]);					
-				} else {
-					String[] codePoints = values[0].split("\\.\\.");
-					int cpFrom = Integer.decode(codePoints[0]).intValue();
-					int cpTo = Integer.decode(codePoints[1]).intValue();
-					
-					for(int i = cpFrom; i <= cpTo; i++){
-						dictionary.putCharacterCategory(i, values[1]);					
-					}
-				}
-			} else {	// Invoke definition
-				String[] values = line.split(" "); // Consecutive space is merged above
-				String characterClassName = values[0];
-				int invoke = Integer.parseInt(values[1]);
-				int group = Integer.parseInt(values[2]);
-				int length = Integer.parseInt(values[3]);
-				dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
-			}
-		}
-	}
+  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
+  
+  private String encoding = "euc-jp";
+  
+  public UnknownDictionaryBuilder() {
+    
+  }
+  
+  public UnknownDictionaryBuilder(String encoding) {
+    this.encoding = encoding;
+  }
+  
+  public UnknownDictionary build(String dirname) throws IOException {
+    UnknownDictionary unkDictionary = null;
+    unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
+    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
+    return unkDictionary;
+  }
+  
+  public UnknownDictionary readDictionaryFile(String filename)
+      throws IOException {
+    return readDictionaryFile(filename, encoding);
+  }
+  
+  public UnknownDictionary readDictionaryFile(String filename, String encoding)
+      throws IOException {
+    UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
+    
+    FileInputStream inputStream = new FileInputStream(filename);
+    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+    LineNumberReader lineReader = new LineNumberReader(streamReader);
+    
+    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
+    
+    String line = null;
+    while ((line = lineReader.readLine()) != null) {
+      dictionary.put(CSVUtil.parse(line)); // Probably we don't need to validate entry
+    }
+    
+    return dictionary;
+  }
+  
+  public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
+    FileInputStream inputStream = new FileInputStream(filename);
+    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+    LineNumberReader lineReader = new LineNumberReader(streamReader);
+    
+    String line = null;
+    
+    while ((line = lineReader.readLine()) != null) {
+      line = line.replaceAll("^\\s", "");
+      line = line.replaceAll("\\s*#.*", "");
+      line = line.replaceAll("\\s+", " ");
+      
+      // Skip empty line or comment line
+      if(line.length() == 0) {
+        continue;
+      }
+      
+      if(line.startsWith("0x")) {	// Category mapping
+        String[] values = line.split(" ", 2);	// Split only first space
+        
+        if(!values[0].contains("..")) {
+          int cp = Integer.decode(values[0]).intValue();
+          dictionary.putCharacterCategory(cp, values[1]);					
+        } else {
+          String[] codePoints = values[0].split("\\.\\.");
+          int cpFrom = Integer.decode(codePoints[0]).intValue();
+          int cpTo = Integer.decode(codePoints[1]).intValue();
+          
+          for(int i = cpFrom; i <= cpTo; i++){
+            dictionary.putCharacterCategory(i, values[1]);					
+          }
+        }
+      } else {	// Invoke definition
+        String[] values = line.split(" "); // Consecutive space is merged above
+        String characterClassName = values[0];
+        int invoke = Integer.parseInt(values[1]);
+        int group = Integer.parseInt(values[2]);
+        int length = Integer.parseInt(values[3]);
+        dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+      }
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Tue Jan  3 04:33:56 2012
@@ -25,202 +25,202 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 public class GraphvizFormatter {
-
-	private final static String BOS_LABEL = "BOS";
-
-	private final static String EOS_LABEL = "EOS";
-	
-	private final static String FONT_NAME = "Helvetica";
-
-	private ConnectionCosts costs;
-	
-	private Map<String, ViterbiNode> nodeMap;
-
-	private Map<String, String> bestPathMap;
-
-	private boolean foundBOS;
-		
-	public GraphvizFormatter(ConnectionCosts costs) {
-		this.costs = costs;
-		this.nodeMap = new HashMap<String, ViterbiNode>();
-		this.bestPathMap = new HashMap<String, String>();
-	}
-	
-	public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
-		initBestPathMap(null);
-		
-		StringBuilder sb = new StringBuilder();
-		sb.append(formatHeader());
-		sb.append(formatNodes(startsArray, endsArray));
-		sb.append(formatTrailer());
-		return sb.toString();
-	}
-	
-	public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
-
-//		List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
-		initBestPathMap(bestPath);
-
-		StringBuilder sb = new StringBuilder();
-		sb.append(formatHeader());
-		sb.append(formatNodes(startsArray, endsArray));
-		sb.append(formatTrailer());
-		return sb.toString();
-		
-	}
-	
-	private void initBestPathMap(List<ViterbiNode> bestPath) {
-		this.bestPathMap.clear();
-
-		if (bestPath == null){
-			return;
-		}
-		for (int i = 0; i < bestPath.size() - 1; i++) {
-			ViterbiNode from = bestPath.get(i);
-			ViterbiNode to = bestPath.get(i + 1);
-
-			String fromId = getNodeId(from);
-			String toId = getNodeId(to);
-			
-			assert this.bestPathMap.containsKey(fromId) == false;
-			assert this.bestPathMap.containsValue(toId) == false;
-			this.bestPathMap.put(fromId, toId);
-		}
-	}
-
-	private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
-		this.nodeMap.clear();
-		this.foundBOS = false;
-		
-		StringBuilder sb = new StringBuilder();
-		for (int i = 1; i < endsArray.length; i++) {
-			if(endsArray[i] == null || startsArray[i] == null) {
-				continue;
-			}
-			for (int j = 0; j < endsArray[i].length; j++) {
-				ViterbiNode from = endsArray[i][j];
-				if(from == null){
-					continue;
-				}
-				sb.append(formatNodeIfNew(from));
-				for (int k = 0; k < startsArray[i].length; k++) {
-					ViterbiNode to = startsArray[i][k];
-					if(to == null){
-						break;
-					}
-					sb.append(formatNodeIfNew(to));
-					sb.append(formatEdge(from, to));
-				}
-			}
-		}
-		return sb.toString();
-	}
-	
-	private String formatNodeIfNew(ViterbiNode node) {
-		String nodeId = getNodeId(node);
-		if (! this.nodeMap.containsKey(nodeId)) {
-			this.nodeMap.put(nodeId, node);
-			return formatNode(node);
-		} else {
-			return "";
-		}
-	}	
-	
-	private String formatHeader() {
-		StringBuilder sb = new StringBuilder();
-		sb.append("digraph viterbi {\n");
-		sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
-		sb.append("# A2 paper size\n");
-		sb.append("size = \"34.4,16.5\";\n");
-		sb.append("# try to fill paper\n");
-		sb.append("ratio = fill;\n");
-		sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
-		sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
-		
-		return sb.toString();
-	}
-
-	private String formatTrailer() {
-		return "}";
-	}
-	
-	
-	private String formatEdge(ViterbiNode from, ViterbiNode to) {
-		if (this.bestPathMap.containsKey(getNodeId(from)) &&
-			this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
-			return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
-
-		} else {
-			return formatEdge(from, to, "");
-		}
-	}
-
-	
-	private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
-		StringBuilder sb = new StringBuilder();
-		sb.append(getNodeId(from));
-		sb.append(" -> ");
-		sb.append(getNodeId(to));
-		sb.append(" [ ");
-		sb.append("label=\"");
-		sb.append(getCost(from, to));
-		sb.append("\"");
-		sb.append(" ");
-		sb.append(attributes);
-		sb.append(" ");
-		sb.append(" ]");
-		sb.append("\n");
-		return sb.toString();
-	}
-	
-	private String formatNode(ViterbiNode node) {
-		StringBuilder sb = new StringBuilder();
-		sb.append("\"");
-		sb.append(getNodeId(node));
-		sb.append("\"");
-		sb.append(" [ ");
-		sb.append("label=");
-		sb.append(formatNodeLabel(node));
-		sb.append(" ]");
-		return sb.toString();
-	}
-	
-	private String formatNodeLabel(ViterbiNode node) {
-		StringBuilder sb = new StringBuilder();
-		sb.append("<<table border=\"0\" cellborder=\"0\">");
-		sb.append("<tr><td>");
-		sb.append(getNodeLabel(node));
-		sb.append("</td></tr>");
-		sb.append("<tr><td>");
-		sb.append("<font color=\"blue\">");
-		sb.append(node.getWordCost());
-		sb.append("</font>");
-		sb.append("</td></tr>");
-//		sb.append("<tr><td>");
-//		sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
-//		sb.append("</td></tr>");
-		sb.append("</table>>");
-		return sb.toString();
-	}
-
-	private String getNodeId(ViterbiNode node) {
-		return String.valueOf(node.hashCode());
-	}
-	
-	private String getNodeLabel(ViterbiNode node) {
-		if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
-			if (this.foundBOS) {
-				return EOS_LABEL;
-			} else {
-				this.foundBOS = true;
-				return BOS_LABEL;
-			}
-		} else {
-			return node.getSurfaceForm();
-		}
-	}
-	
-	private int getCost(ViterbiNode from, ViterbiNode to) {
-		return this.costs.get(from.getLeftId(), to.getRightId());
-	}
+  
+  private final static String BOS_LABEL = "BOS";
+  
+  private final static String EOS_LABEL = "EOS";
+  
+  private final static String FONT_NAME = "Helvetica";
+  
+  private ConnectionCosts costs;
+  
+  private Map<String, ViterbiNode> nodeMap;
+  
+  private Map<String, String> bestPathMap;
+  
+  private boolean foundBOS;
+  
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.nodeMap = new HashMap<String, ViterbiNode>();
+    this.bestPathMap = new HashMap<String, String>();
+  }
+  
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    initBestPathMap(null);
+    
+    StringBuilder sb = new StringBuilder();
+    sb.append(formatHeader());
+    sb.append(formatNodes(startsArray, endsArray));
+    sb.append(formatTrailer());
+    return sb.toString();
+  }
+  
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
+    
+    //		List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
+    initBestPathMap(bestPath);
+    
+    StringBuilder sb = new StringBuilder();
+    sb.append(formatHeader());
+    sb.append(formatNodes(startsArray, endsArray));
+    sb.append(formatTrailer());
+    return sb.toString();
+    
+  }
+  
+  private void initBestPathMap(List<ViterbiNode> bestPath) {
+    this.bestPathMap.clear();
+    
+    if (bestPath == null){
+      return;
+    }
+    for (int i = 0; i < bestPath.size() - 1; i++) {
+      ViterbiNode from = bestPath.get(i);
+      ViterbiNode to = bestPath.get(i + 1);
+      
+      String fromId = getNodeId(from);
+      String toId = getNodeId(to);
+      
+      assert this.bestPathMap.containsKey(fromId) == false;
+      assert this.bestPathMap.containsValue(toId) == false;
+      this.bestPathMap.put(fromId, toId);
+    }
+  }
+  
+  private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    this.nodeMap.clear();
+    this.foundBOS = false;
+    
+    StringBuilder sb = new StringBuilder();
+    for (int i = 1; i < endsArray.length; i++) {
+      if(endsArray[i] == null || startsArray[i] == null) {
+        continue;
+      }
+      for (int j = 0; j < endsArray[i].length; j++) {
+        ViterbiNode from = endsArray[i][j];
+        if(from == null){
+          continue;
+        }
+        sb.append(formatNodeIfNew(from));
+        for (int k = 0; k < startsArray[i].length; k++) {
+          ViterbiNode to = startsArray[i][k];
+          if(to == null){
+            break;
+          }
+          sb.append(formatNodeIfNew(to));
+          sb.append(formatEdge(from, to));
+        }
+      }
+    }
+    return sb.toString();
+  }
+  
+  private String formatNodeIfNew(ViterbiNode node) {
+    String nodeId = getNodeId(node);
+    if (! this.nodeMap.containsKey(nodeId)) {
+      this.nodeMap.put(nodeId, node);
+      return formatNode(node);
+    } else {
+      return "";
+    }
+  }	
+  
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
+    sb.append("# A2 paper size\n");
+    sb.append("size = \"34.4,16.5\";\n");
+    sb.append("# try to fill paper\n");
+    sb.append("ratio = fill;\n");
+    sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
+  }
+  
+  private String formatTrailer() {
+    return "}";
+  }
+  
+  
+  private String formatEdge(ViterbiNode from, ViterbiNode to) {
+    if (this.bestPathMap.containsKey(getNodeId(from)) &&
+        this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
+      return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
+      
+    } else {
+      return formatEdge(from, to, "");
+    }
+  }
+  
+  
+  private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getNodeId(from));
+    sb.append(" -> ");
+    sb.append(getNodeId(to));
+    sb.append(" [ ");
+    sb.append("label=\"");
+    sb.append(getCost(from, to));
+    sb.append("\"");
+    sb.append(" ");
+    sb.append(attributes);
+    sb.append(" ");
+    sb.append(" ]");
+    sb.append("\n");
+    return sb.toString();
+  }
+  
+  private String formatNode(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("\"");
+    sb.append(getNodeId(node));
+    sb.append("\"");
+    sb.append(" [ ");
+    sb.append("label=");
+    sb.append(formatNodeLabel(node));
+    sb.append(" ]");
+    return sb.toString();
+  }
+  
+  private String formatNodeLabel(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("<<table border=\"0\" cellborder=\"0\">");
+    sb.append("<tr><td>");
+    sb.append(getNodeLabel(node));
+    sb.append("</td></tr>");
+    sb.append("<tr><td>");
+    sb.append("<font color=\"blue\">");
+    sb.append(node.getWordCost());
+    sb.append("</font>");
+    sb.append("</td></tr>");
+    //		sb.append("<tr><td>");
+    //		sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
+    //		sb.append("</td></tr>");
+    sb.append("</table>>");
+    return sb.toString();
+  }
+  
+  private String getNodeId(ViterbiNode node) {
+    return String.valueOf(node.hashCode());
+  }
+  
+  private String getNodeLabel(ViterbiNode node) {
+    if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
+      if (this.foundBOS) {
+        return EOS_LABEL;
+      } else {
+        this.foundBOS = true;
+        return BOS_LABEL;
+      }
+    } else {
+      return node.getSurfaceForm();
+    }
+  }
+  
+  private int getCost(ViterbiNode from, ViterbiNode to) {
+    return this.costs.get(from.getLeftId(), to.getRightId());
+  }
 }
\ No newline at end of file