You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC
svn commit: r1226637 [2/3] - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Tue Jan 3 04:33:56 2012
@@ -33,285 +33,285 @@ import java.nio.channels.ReadableByteCha
import org.apache.lucene.analysis.kuromoji.trie.Trie.Node;
public class DoubleArrayTrie {
-
- public static final String FILENAME = "dat.dat";
-
- public static final char TERMINATING_CHARACTER = '\u0001';
-
- private static final int BASE_CHECK_INITILAL_SIZE = 1000000;
-
- private static final int TAIL_INITIAL_SIZE = 10000;
-
- private static final int TAIL_OFFSET = 10000000;
-
- private IntBuffer baseBuffer;
-
- private IntBuffer checkBuffer;
-
- private CharBuffer tailBuffer;
-
- private int tailIndex = TAIL_OFFSET;
-
-
- public DoubleArrayTrie(){
- }
-
- /**
- * Write to file
- * @param filename filename
- * @throws IOException
- */
- public void write(String directoryname) throws IOException {
- String filename = directoryname + File.separator + FILENAME;
-
- baseBuffer.rewind();
- checkBuffer.rewind();
- tailBuffer.rewind();
-
- File file = new File(filename);
- if(file.exists()){
- file.delete();
- }
-
- RandomAccessFile raf = new RandomAccessFile(filename, "rw");
- FileChannel channel = raf.getChannel();
- raf.writeInt(baseBuffer.capacity());
- raf.writeInt(tailBuffer.capacity());
-
- ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
- IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
- tmpIntBuffer.put(baseBuffer);
- tmpBuffer.rewind();
- channel.write(tmpBuffer);
-
- tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
- tmpIntBuffer = tmpBuffer.asIntBuffer();
- tmpIntBuffer.put(checkBuffer);
- tmpBuffer.rewind();
- channel.write(tmpBuffer);
-
- tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
- CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
- tmpCharBuffer.put(tailBuffer);
- tmpBuffer.rewind();
- channel.write(tmpBuffer);
-
- raf.close();
- }
-
- public static DoubleArrayTrie getInstance() throws IOException {
- InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
- return read(is);
- }
-
- /**
- * Load Stored data
- * @param is
- * @return
- * @throws IOException
- */
- public static DoubleArrayTrie read(InputStream is) throws IOException {
- DoubleArrayTrie trie = new DoubleArrayTrie();
- DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
- int baseCheckSize = dis.readInt(); // Read size of baseArr and checkArr
- int tailSize = dis.readInt(); // Read size of tailArr
- ReadableByteChannel channel = Channels.newChannel(dis);
-
-
- ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4); // The size is 4 times the baseCheckSize since it is the length of array
- channel.read(tmpBaseBuffer);
- tmpBaseBuffer.rewind();
- trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
-
- ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
- channel.read(tmpCheckBuffer);
- tmpCheckBuffer.rewind();
- trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
-
- ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2); // The size is 2 times the tailSize since it is the length of array
- channel.read(tmpTailBuffer);
- tmpTailBuffer.rewind();
- trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
-
- is.close();
- return trie;
- }
-
- /**
- * Construct double array trie which is equivalent to input trie
- * @param trie normal trie which contains all dictionary words
- */
- public void build(Trie trie) {
- baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
- baseBuffer.put(0, 1);
- checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
- tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
- add(-1, 0, trie.getRoot());
- }
-
- /**
- * Add Node(character) to double array trie
- * @param previous
- * @param index
- * @param node
- */
- private void add(int previous, int index, Node node) {
- Node[] children = node.getChildren(); // nodes following current node
-
- if(node.getChildren().length > 0 && node.hasSinglePath() && node.getChildren()[0].getKey() != TERMINATING_CHARACTER) { // If node has only one path, put the rest in tail array
- baseBuffer.put(index, tailIndex); // current index of tail array
- addToTail(node.children[0]);
- checkBuffer.put(index, previous);
- return; // No more child to process
- }
-
- int base = findBase(index, children); // Get base value for current index
- baseBuffer.put(index, base);
-
- if(previous >= 0){
- checkBuffer.put(index, previous); // Set check value
- }
-
- for(Trie.Node child : children) { // For each child to double array trie
- add(index, index + base + child.getKey(), child);
- }
-
- }
-
- /**
- * Match input keyword.
- * @param key key to match
- * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
- */
- public int lookup(String key) {
- int index = 0;
- int base = 1; // base at index 0 should be 1
-
- int keyLength = key.length();
- for(int i = 0; i < keyLength; i++) {
- int previous = index;
- index = index + base + key.charAt(i);
-
- if(index > baseBuffer.limit()) { // Too long
- return -1;
- }
-
- base = baseBuffer.get(index);
-
- if (base == 0 ) { // Didn't find match
- return -1;
- }
-
- if(checkBuffer.get(index) != previous){ // check doesn't match
- return -1;
- }
-
- if(base >= TAIL_OFFSET) { // If base is bigger than TAIL_OFFSET, start processing "tail"
- return matchTail(base, index, key.substring(i + 1));
- }
-
- }
-
- // If we reach at the end of input keyword, check if it is complete match by looking for following terminating character
- int endIndex = index + base + TERMINATING_CHARACTER;
-
- return checkBuffer.get(endIndex) == index ? index : 0;
- }
-
- /**
- * Check match in tail array
- * @param base
- * @param index
- * @param key
- * @return index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
- */
- private int matchTail(int base, int index, String key) {
- int positionInTailArr = base - TAIL_OFFSET;
-
- int keyLength = key.length();
- for(int i = 0; i < keyLength; i++) {
- if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
- return -1;
- }
- }
- return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
-
- }
-
- /**
- * Find base value for current node, which contains input nodes. They are children of current node.
- * Set default base value , which is one, at the index of each input node.
- * @param index
- * @param nodes
- * @return base value for current node
- */
- private int findBase(int index, Node[] nodes){
- int base = baseBuffer.get(index);
- if(base < 0) {
- return base;
- }
-
- while(true) {
- boolean collision = false; // already taken?
- for(Node node : nodes) {
- /*
- * NOTE:
- * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
- * However, this makes array bigger. If there is a need to compat the file dat.dat, it's possbile to modify here and there.
- * Although the size of jar file doesn't change, memory consumption will be smaller.
- */
- int nextIndex = index + base + node.getKey();
-
- if(baseBuffer.capacity() <= nextIndex) {
- int newLength = nextIndex + 1;
- IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
- baseBuffer.rewind();
- newBaseBuffer.put(baseBuffer);
- baseBuffer = newBaseBuffer;
- IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
- checkBuffer.rewind();
- newCheckBuffer.put(checkBuffer);
- checkBuffer = newCheckBuffer;
- }
-
- if(baseBuffer.get(nextIndex) != 0) { // already taken
- base++; // check next base value
- collision = true;
- break;
- }
- }
-
- if(!collision){
- break; // if there is no collision, found proper base value. Break the while loop.
- }
-
- }
-
- for(Node node : nodes) {
- baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1); // Set -1 if key is terminating character. Set default base value 1 if not.
- }
-
- return base;
- }
-
- /**
- * Add characters(nodes) to tail array
- * @param node
- */
- private void addToTail(Node node) {
- while(true) {
- if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
- CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
- tailBuffer.rewind();
- newTailBuffer.put(tailBuffer);
- tailBuffer = newTailBuffer;
- }
- tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node
-
- if(node.getChildren().length == 0) { // if it reached the end of input, break.
- break;
- }
- node = node.getChildren()[0]; // Move to next node
- }
- }
+
+ public static final String FILENAME = "dat.dat";
+
+ public static final char TERMINATING_CHARACTER = '\u0001';
+
+ private static final int BASE_CHECK_INITILAL_SIZE = 1000000;
+
+ private static final int TAIL_INITIAL_SIZE = 10000;
+
+ private static final int TAIL_OFFSET = 10000000;
+
+ private IntBuffer baseBuffer;
+
+ private IntBuffer checkBuffer;
+
+ private CharBuffer tailBuffer;
+
+ private int tailIndex = TAIL_OFFSET;
+
+
+ public DoubleArrayTrie(){
+ }
+
+ /**
+ * Write to file
+ * @param filename filename
+ * @throws IOException
+ */
+ public void write(String directoryname) throws IOException {
+ String filename = directoryname + File.separator + FILENAME;
+
+ baseBuffer.rewind();
+ checkBuffer.rewind();
+ tailBuffer.rewind();
+
+ File file = new File(filename);
+ if(file.exists()){
+ file.delete();
+ }
+
+ RandomAccessFile raf = new RandomAccessFile(filename, "rw");
+ FileChannel channel = raf.getChannel();
+ raf.writeInt(baseBuffer.capacity());
+ raf.writeInt(tailBuffer.capacity());
+
+ ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
+ IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
+ tmpIntBuffer.put(baseBuffer);
+ tmpBuffer.rewind();
+ channel.write(tmpBuffer);
+
+ tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
+ tmpIntBuffer = tmpBuffer.asIntBuffer();
+ tmpIntBuffer.put(checkBuffer);
+ tmpBuffer.rewind();
+ channel.write(tmpBuffer);
+
+ tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
+ CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
+ tmpCharBuffer.put(tailBuffer);
+ tmpBuffer.rewind();
+ channel.write(tmpBuffer);
+
+ raf.close();
+ }
+
+ public static DoubleArrayTrie getInstance() throws IOException {
+ InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(FILENAME);
+ return read(is);
+ }
+
+ /**
+ * Load Stored data
+ * @param is
+ * @return
+ * @throws IOException
+ */
+ public static DoubleArrayTrie read(InputStream is) throws IOException {
+ DoubleArrayTrie trie = new DoubleArrayTrie();
+ DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
+ int baseCheckSize = dis.readInt(); // Read size of baseArr and checkArr
+ int tailSize = dis.readInt(); // Read size of tailArr
+ ReadableByteChannel channel = Channels.newChannel(dis);
+
+
+ ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4); // The size is 4 times the baseCheckSize since it is the length of array
+ channel.read(tmpBaseBuffer);
+ tmpBaseBuffer.rewind();
+ trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
+
+ ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
+ channel.read(tmpCheckBuffer);
+ tmpCheckBuffer.rewind();
+ trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
+
+ ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2); // The size is 2 times the tailSize since it is the length of array
+ channel.read(tmpTailBuffer);
+ tmpTailBuffer.rewind();
+ trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
+
+ is.close();
+ return trie;
+ }
+
+ /**
+ * Construct double array trie which is equivalent to input trie
+ * @param trie normal trie which contains all dictionary words
+ */
+ public void build(Trie trie) {
+ baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+ baseBuffer.put(0, 1);
+ checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
+ tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
+ add(-1, 0, trie.getRoot());
+ }
+
+ /**
+ * Add Node(character) to double array trie
+ * @param previous
+ * @param index
+ * @param node
+ */
+ private void add(int previous, int index, Node node) {
+ Node[] children = node.getChildren(); // nodes following current node
+
+ if(node.getChildren().length > 0 && node.hasSinglePath() && node.getChildren()[0].getKey() != TERMINATING_CHARACTER) { // If node has only one path, put the rest in tail array
+ baseBuffer.put(index, tailIndex); // current index of tail array
+ addToTail(node.children[0]);
+ checkBuffer.put(index, previous);
+ return; // No more child to process
+ }
+
+ int base = findBase(index, children); // Get base value for current index
+ baseBuffer.put(index, base);
+
+ if(previous >= 0){
+ checkBuffer.put(index, previous); // Set check value
+ }
+
+ for(Trie.Node child : children) { // For each child to double array trie
+ add(index, index + base + child.getKey(), child);
+ }
+
+ }
+
+ /**
+ * Match input keyword.
+ * @param key key to match
+ * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
+ */
+ public int lookup(String key) {
+ int index = 0;
+ int base = 1; // base at index 0 should be 1
+
+ int keyLength = key.length();
+ for(int i = 0; i < keyLength; i++) {
+ int previous = index;
+ index = index + base + key.charAt(i);
+
+ if(index > baseBuffer.limit()) { // Too long
+ return -1;
+ }
+
+ base = baseBuffer.get(index);
+
+ if (base == 0 ) { // Didn't find match
+ return -1;
+ }
+
+ if(checkBuffer.get(index) != previous){ // check doesn't match
+ return -1;
+ }
+
+ if(base >= TAIL_OFFSET) { // If base is bigger than TAIL_OFFSET, start processing "tail"
+ return matchTail(base, index, key.substring(i + 1));
+ }
+
+ }
+
+ // If we reach at the end of input keyword, check if it is complete match by looking for following terminating character
+ int endIndex = index + base + TERMINATING_CHARACTER;
+
+ return checkBuffer.get(endIndex) == index ? index : 0;
+ }
+
+ /**
+ * Check match in tail array
+ * @param base
+ * @param index
+ * @param key
+ * @return index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
+ */
+ private int matchTail(int base, int index, String key) {
+ int positionInTailArr = base - TAIL_OFFSET;
+
+ int keyLength = key.length();
+ for(int i = 0; i < keyLength; i++) {
+ if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
+ return -1;
+ }
+ }
+ return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
+
+ }
+
+ /**
+ * Find base value for current node, which contains input nodes. They are children of current node.
+ * Set default base value , which is one, at the index of each input node.
+ * @param index
+ * @param nodes
+ * @return base value for current node
+ */
+ private int findBase(int index, Node[] nodes){
+ int base = baseBuffer.get(index);
+ if(base < 0) {
+ return base;
+ }
+
+ while(true) {
+ boolean collision = false; // already taken?
+ for(Node node : nodes) {
+ /*
+ * NOTE:
+ * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
+ * However, this makes array bigger. If there is a need to compat the file dat.dat, it's possbile to modify here and there.
+ * Although the size of jar file doesn't change, memory consumption will be smaller.
+ */
+ int nextIndex = index + base + node.getKey();
+
+ if(baseBuffer.capacity() <= nextIndex) {
+ int newLength = nextIndex + 1;
+ IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+ baseBuffer.rewind();
+ newBaseBuffer.put(baseBuffer);
+ baseBuffer = newBaseBuffer;
+ IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
+ checkBuffer.rewind();
+ newCheckBuffer.put(checkBuffer);
+ checkBuffer = newCheckBuffer;
+ }
+
+ if(baseBuffer.get(nextIndex) != 0) { // already taken
+ base++; // check next base value
+ collision = true;
+ break;
+ }
+ }
+
+ if(!collision){
+ break; // if there is no collision, found proper base value. Break the while loop.
+ }
+
+ }
+
+ for(Node node : nodes) {
+ baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1); // Set -1 if key is terminating character. Set default base value 1 if not.
+ }
+
+ return base;
+ }
+
+ /**
+ * Add characters(nodes) to tail array
+ * @param node
+ */
+ private void addToTail(Node node) {
+ while(true) {
+ if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
+ CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
+ tailBuffer.rewind();
+ newTailBuffer.put(tailBuffer);
+ tailBuffer = newTailBuffer;
+ }
+ tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node
+
+ if(node.getChildren().length == 0) { // if it reached the end of input, break.
+ break;
+ }
+ node = node.getChildren()[0]; // Move to next node
+ }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java Tue Jan 3 04:33:56 2012
@@ -18,131 +18,131 @@ package org.apache.lucene.analysis.kurom
*/
public class Trie {
-
- private Node root; // Root node of Trie
-
- /**
- * Constructor
- * Initialize Trie with empty root node
- */
- public Trie() {
- root = new Node();
- }
-
- /**
- * Add input value into Trie
- * Before adding, it adds terminating character(\u0001) to input string
- * @param value String to add to Trie
- */
- public void add(String value) {
- root.add(value + DoubleArrayTrie.TERMINATING_CHARACTER);
- }
-
- /**
- * Return root node which contains other nodes
- * @return Node
- */
- public Node getRoot() {
- return root;
- }
-
- /**
- * Trie Node
- */
- public class Node {
- char key; // key(char) of this node
-
- Node[] children = new Node[0]; // Array to hold children nodes
-
- /**
- * Constructor
- */
- public Node() {
- }
-
- /**
- * Constructor
- * @param key key for this node
- */
- public Node(char key) {
- this.key = key;
- }
-
- /**
- * Add string to Trie
- * @param value String to add
- */
- public void add(String value) {
- if (value.length() == 0) {
- return;
- }
-
- Node node = new Node(value.charAt(0));
- addChild(node).add(value.substring(1));
- }
-
- /**
- * Add Node to this node as child
- * @param newNode node to add
- * @return added node. If a node with same key already exists, return that node.
- */
- public Node addChild(Node newNode) {
- Node child = getChild(newNode.getKey());
- if (child == null) {
- Node[] newChildren = new Node[children.length + 1];
- System.arraycopy(children, 0, newChildren, 0, children.length);
- newChildren[newChildren.length -1] = newNode;
- children = newChildren;
- child = newNode;
- }
- return child;
- }
-
- /**
- * Return the key of the node
- * @return key
- */
- public char getKey() {
- return key;
- }
-
- /**
- * Check if children following this node has only single path.
- * For example, if you have "abcde" and "abfgh" in Trie, calling this method on node "a" and "b" returns false.
- * Calling this method on "c", "d", "e", "f", "g" and "h" returns true.
- * @return true if it has only single path. false if it has multiple path.
- */
- public boolean hasSinglePath() {
- switch(children.length){
- case 0:
- return true;
- case 1:
- return children[0].hasSinglePath();
- default:
- return false;
- }
- }
-
- /**
- * Return children node
- * @return Array of children nodes
- */
- public Node[] getChildren() {
- return children;
- }
-
- /**
- * Return node which has input key
- * @param key key to look for
- * @return node which has input key. null if it doesn't exist.
- */
- private Node getChild(char key) {
- for (Node child : children) {
- if (child.getKey() == key) {
- return child;
- }
- }
- return null;
- }
- }
+
+ private Node root; // Root node of Trie
+
+ /**
+ * Constructor
+ * Initialize Trie with empty root node
+ */
+ public Trie() {
+ root = new Node();
+ }
+
+ /**
+ * Add input value into Trie
+ * Before adding, it adds terminating character(\u0001) to input string
+ * @param value String to add to Trie
+ */
+ public void add(String value) {
+ root.add(value + DoubleArrayTrie.TERMINATING_CHARACTER);
+ }
+
+ /**
+ * Return root node which contains other nodes
+ * @return Node
+ */
+ public Node getRoot() {
+ return root;
+ }
+
+ /**
+ * Trie Node
+ */
+ public class Node {
+ char key; // key(char) of this node
+
+ Node[] children = new Node[0]; // Array to hold children nodes
+
+ /**
+ * Constructor
+ */
+ public Node() {
+ }
+
+ /**
+ * Constructor
+ * @param key key for this node
+ */
+ public Node(char key) {
+ this.key = key;
+ }
+
+ /**
+ * Add string to Trie
+ * @param value String to add
+ */
+ public void add(String value) {
+ if (value.length() == 0) {
+ return;
+ }
+
+ Node node = new Node(value.charAt(0));
+ addChild(node).add(value.substring(1));
+ }
+
+ /**
+ * Add Node to this node as child
+ * @param newNode node to add
+ * @return added node. If a node with same key already exists, return that node.
+ */
+ public Node addChild(Node newNode) {
+ Node child = getChild(newNode.getKey());
+ if (child == null) {
+ Node[] newChildren = new Node[children.length + 1];
+ System.arraycopy(children, 0, newChildren, 0, children.length);
+ newChildren[newChildren.length -1] = newNode;
+ children = newChildren;
+ child = newNode;
+ }
+ return child;
+ }
+
+ /**
+ * Return the key of the node
+ * @return key
+ */
+ public char getKey() {
+ return key;
+ }
+
+ /**
+ * Check if children following this node has only single path.
+ * For example, if you have "abcde" and "abfgh" in Trie, calling this method on node "a" and "b" returns false.
+ * Calling this method on "c", "d", "e", "f", "g" and "h" returns true.
+ * @return true if it has only single path. false if it has multiple path.
+ */
+ public boolean hasSinglePath() {
+ switch(children.length){
+ case 0:
+ return true;
+ case 1:
+ return children[0].hasSinglePath();
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * Return children node
+ * @return Array of children nodes
+ */
+ public Node[] getChildren() {
+ return children;
+ }
+
+ /**
+ * Return node which has input key
+ * @param key key to look for
+ * @return node which has input key. null if it doesn't exist.
+ */
+ private Node getChild(char key) {
+ for (Node child : children) {
+ if (child.getKey() == key) {
+ return child;
+ }
+ }
+ return null;
+ }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java Tue Jan 3 04:33:56 2012
@@ -22,80 +22,80 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CSVUtil {
- private static final char QUOTE = '"';
-
- private static final char COMMA = ',';
-
- private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
-
- private static final String ESCAPED_QUOTE = "\"\"";
-
- /**
- * Parse CSV line
- * @param line
- * @return Array of values
- */
- public static String[] parse(String line) {
- boolean insideQuote = false;
- ArrayList<String> result = new ArrayList<String>();
- int quoteCount = 0;
- StringBuilder sb = new StringBuilder();
- for(int i = 0; i < line.length(); i++) {
- char c = line.charAt(i);
-
- if(c == QUOTE) {
- insideQuote = !insideQuote;
- quoteCount++;
- }
-
- if(c == COMMA && !insideQuote) {
- String value = sb.toString();
- value = unQuoteUnEscape(value);
- result.add(value);
- sb = new StringBuilder();
- continue;
- }
-
- sb.append(c);
- }
-
- result.add(sb.toString());
-
- // Validate
- if(quoteCount % 2 != 0) {
- return new String[0];
- }
-
- return result.toArray(new String[result.size()]);
- }
-
- private static String unQuoteUnEscape(String original) {
- String result = original;
-
- // Unquote
- Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
- if(m.matches()) {
- result = m.group(1);
- }
-
- // Unescape
- result = result.replaceAll(ESCAPED_QUOTE, "\"");
-
- return result;
-
- }
-
- /**
- * Quote and escape input value for CSV
- * @param original
- * @return
- */
- public static String quoteEscape(String original) {
- String result = original.replaceAll("\"", ESCAPED_QUOTE);
- if(result.indexOf(COMMA) >= 0) {
- result = "\"" + result + "\"";
- }
- return result;
- }
-
+ private static final char QUOTE = '"';
+
+ private static final char COMMA = ',';
+
+ private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
+
+ private static final String ESCAPED_QUOTE = "\"\"";
+
+ /**
+ * Parse CSV line
+ * @param line
+ * @return Array of values
+ */
+ public static String[] parse(String line) {
+ boolean insideQuote = false;
+ ArrayList<String> result = new ArrayList<String>();
+ int quoteCount = 0;
+ StringBuilder sb = new StringBuilder();
+ for(int i = 0; i < line.length(); i++) {
+ char c = line.charAt(i);
+
+ if(c == QUOTE) {
+ insideQuote = !insideQuote;
+ quoteCount++;
+ }
+
+ if(c == COMMA && !insideQuote) {
+ String value = sb.toString();
+ value = unQuoteUnEscape(value);
+ result.add(value);
+ sb = new StringBuilder();
+ continue;
+ }
+
+ sb.append(c);
+ }
+
+ result.add(sb.toString());
+
+ // Validate
+ if(quoteCount % 2 != 0) {
+ return new String[0];
+ }
+
+ return result.toArray(new String[result.size()]);
+ }
+
+ private static String unQuoteUnEscape(String original) {
+ String result = original;
+
+ // Unquote
+ Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
+ if(m.matches()) {
+ result = m.group(1);
+ }
+
+ // Unescape
+ result = result.replaceAll(ESCAPED_QUOTE, "\"");
+
+ return result;
+
+ }
+
+ /**
+ * Quote and escape input value for CSV
+ * @param original
+ * @return
+ */
+ public static String quoteEscape(String original) {
+ String result = original.replaceAll("\"", ESCAPED_QUOTE);
+ if(result.indexOf(COMMA) >= 0) {
+ result = "\"" + result + "\"";
+ }
+ return result;
+ }
+
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java Tue Jan 3 04:33:56 2012
@@ -25,39 +25,39 @@ import java.io.LineNumberReader;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
public class ConnectionCostsBuilder {
-
- public ConnectionCostsBuilder() {
-
- }
-
- public static ConnectionCosts build(String filename) throws IOException {
- FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- String line = lineReader.readLine();
- String[] dimensions = line.split("\\s+");
-
- assert dimensions.length == 3;
-
- int forwardSize = Integer.parseInt(dimensions[0]);
- int backwardSize = Integer.parseInt(dimensions[1]);
-
- assert forwardSize > 0 && backwardSize > 0;
-
- ConnectionCosts costs = new ConnectionCosts(forwardSize, backwardSize);
-
- while ((line = lineReader.readLine()) != null) {
- String[] fields = line.split("\\s+");
-
- assert fields.length == 3;
-
- int forwardId = Integer.parseInt(fields[0]);
- int backwardId = Integer.parseInt(fields[1]);
- int cost = Integer.parseInt(fields[2]);
-
- costs.add(forwardId, backwardId, cost);
- }
- return costs;
- }
+
+ public ConnectionCostsBuilder() {
+
+ }
+
+ public static ConnectionCosts build(String filename) throws IOException {
+ FileInputStream inputStream = new FileInputStream(filename);
+ InputStreamReader streamReader = new InputStreamReader(inputStream);
+ LineNumberReader lineReader = new LineNumberReader(streamReader);
+
+ String line = lineReader.readLine();
+ String[] dimensions = line.split("\\s+");
+
+ assert dimensions.length == 3;
+
+ int forwardSize = Integer.parseInt(dimensions[0]);
+ int backwardSize = Integer.parseInt(dimensions[1]);
+
+ assert forwardSize > 0 && backwardSize > 0;
+
+ ConnectionCosts costs = new ConnectionCosts(forwardSize, backwardSize);
+
+ while ((line = lineReader.readLine()) != null) {
+ String[] fields = line.split("\\s+");
+
+ assert fields.length == 3;
+
+ int forwardId = Integer.parseInt(fields[0]);
+ int backwardId = Integer.parseInt(fields[1]);
+ int cost = Integer.parseInt(fields[2]);
+
+ costs.add(forwardId, backwardId, cost);
+ }
+ return costs;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Tue Jan 3 04:33:56 2012
@@ -27,81 +27,81 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
public class DictionaryBuilder {
-
- public enum DictionaryFormat { IPADIC, UNIDIC };
-
- public DictionaryBuilder() {
-
- }
-
- public void build(DictionaryFormat format,
- String inputDirname,
- String outputDirname,
- String encoding,
- boolean normalizeEntry) throws IOException {
- System.out.println("building tokeninfo dict...");
- TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
- TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
-
- System.out.print(" building double array trie...");
- DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
- trie.write(outputDirname);
- System.out.println(" done");
-
- System.out.print(" processing target map...");
- for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
- int tokenInfoId = entry.getKey();
- String surfaceform = entry.getValue();
- int doubleArrayId = trie.lookup(surfaceform);
- assert doubleArrayId > 0;
- tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
- }
- tokenInfoDictionary.write(outputDirname);
- trie = null;
- tokenInfoBuilder = null;
-
- System.out.println(" done");
- System.out.println("done");
-
- System.out.print("building unknown word dict...");
- UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
- UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
- unkDictionary.write(outputDirname);
- System.out.println("done");
-
- System.out.print("building connection costs...");
- ConnectionCosts connectionCosts
- = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
- connectionCosts.write(outputDirname);
- System.out.println("done");
- }
-
- public static void main(String[] args) throws IOException, ClassNotFoundException {
- DictionaryFormat format;
- if (args[0].equalsIgnoreCase("ipadic")) {
- format = DictionaryFormat.IPADIC;
- } else if (args[0].equalsIgnoreCase("unidic")) {
- format = DictionaryFormat.UNIDIC;
- } else {
- System.err.println("Illegal format " + args[0] + " using unidic instead");
- format = DictionaryFormat.IPADIC;
- }
-
- String inputDirname = args[1];
- String outputDirname = args[2];
- String inputEncoding = args[3];
- boolean normalizeEntries = Boolean.parseBoolean(args[4]);
-
- DictionaryBuilder builder = new DictionaryBuilder();
- System.out.println("dictionary builder");
- System.out.println("");
- System.out.println("dictionary format: " + format);
- System.out.println("input directory: " + inputDirname);
- System.out.println("output directory: " + outputDirname);
- System.out.println("input encoding: " + inputEncoding);
- System.out.println("normalize entries: " + normalizeEntries);
- System.out.println("");
- builder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
- }
-
+
+ public enum DictionaryFormat { IPADIC, UNIDIC };
+
+ public DictionaryBuilder() {
+
+ }
+
+ public void build(DictionaryFormat format,
+ String inputDirname,
+ String outputDirname,
+ String encoding,
+ boolean normalizeEntry) throws IOException {
+ System.out.println("building tokeninfo dict...");
+ TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
+ TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
+
+ System.out.print(" building double array trie...");
+ DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
+ trie.write(outputDirname);
+ System.out.println(" done");
+
+ System.out.print(" processing target map...");
+ for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
+ int tokenInfoId = entry.getKey();
+ String surfaceform = entry.getValue();
+ int doubleArrayId = trie.lookup(surfaceform);
+ assert doubleArrayId > 0;
+ tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
+ }
+ tokenInfoDictionary.write(outputDirname);
+ trie = null;
+ tokenInfoBuilder = null;
+
+ System.out.println(" done");
+ System.out.println("done");
+
+ System.out.print("building unknown word dict...");
+ UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
+ UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
+ unkDictionary.write(outputDirname);
+ System.out.println("done");
+
+ System.out.print("building connection costs...");
+ ConnectionCosts connectionCosts
+ = ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
+ connectionCosts.write(outputDirname);
+ System.out.println("done");
+ }
+
+ public static void main(String[] args) throws IOException, ClassNotFoundException {
+ DictionaryFormat format;
+ if (args[0].equalsIgnoreCase("ipadic")) {
+ format = DictionaryFormat.IPADIC;
+ } else if (args[0].equalsIgnoreCase("unidic")) {
+ format = DictionaryFormat.UNIDIC;
+ } else {
+ System.err.println("Illegal format " + args[0] + " using unidic instead");
+ format = DictionaryFormat.IPADIC;
+ }
+
+ String inputDirname = args[1];
+ String outputDirname = args[2];
+ String inputEncoding = args[3];
+ boolean normalizeEntries = Boolean.parseBoolean(args[4]);
+
+ DictionaryBuilder builder = new DictionaryBuilder();
+ System.out.println("dictionary builder");
+ System.out.println("");
+ System.out.println("dictionary format: " + format);
+ System.out.println("input directory: " + inputDirname);
+ System.out.println("output directory: " + outputDirname);
+ System.out.println("input encoding: " + inputEncoding);
+ System.out.println("normalize entries: " + normalizeEntries);
+ System.out.println("");
+ builder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
+ }
+
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java Tue Jan 3 04:33:56 2012
@@ -24,25 +24,25 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.trie.Trie;
public class DoubleArrayTrieBuilder {
-
-
- public DoubleArrayTrieBuilder() {
-
- }
-
- public static DoubleArrayTrie build(Set<Entry<Integer, String>> entries) {
- Trie tempTrie = buildTrie(entries);
- DoubleArrayTrie daTrie = new DoubleArrayTrie();
- daTrie.build(tempTrie);
- return daTrie;
- }
-
- public static Trie buildTrie(Set<Entry<Integer, String>> entries) {
- Trie trie = new Trie();
- for (Entry<Integer, String> entry : entries) {
- String surfaceForm = entry.getValue();
- trie.add(surfaceForm);
- }
- return trie;
- }
+
+
+ public DoubleArrayTrieBuilder() {
+
+ }
+
+ public static DoubleArrayTrie build(Set<Entry<Integer, String>> entries) {
+ Trie tempTrie = buildTrie(entries);
+ DoubleArrayTrie daTrie = new DoubleArrayTrie();
+ daTrie.build(tempTrie);
+ return daTrie;
+ }
+
+ public static Trie buildTrie(Set<Entry<Integer, String>> entries) {
+ Trie trie = new Trie();
+ for (Entry<Integer, String> entry : entries) {
+ String surfaceForm = entry.getValue();
+ trie.add(surfaceForm);
+ }
+ return trie;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Tue Jan 3 04:33:56 2012
@@ -39,142 +39,142 @@ import org.apache.lucene.analysis.kuromo
* @author Christian Moen
*/
public class TokenInfoDictionaryBuilder {
-
- /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
- private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
-
- private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
-
- private String encoding = "euc-jp";
-
- private boolean normalizeEntries = false;
-
- private DictionaryFormat format = DictionaryFormat.IPADIC;
-
- public TokenInfoDictionaryBuilder() {
- }
-
- public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
- this.format = format;
- this.encoding = encoding;
- this.dictionaryEntries = new TreeMap<Integer, String>();
- this.normalizeEntries = normalizeEntries;
- }
-
- public TokenInfoDictionary build(String dirname) throws IOException {
- FilenameFilter filter = new FilenameFilter() {
- @Override
- public boolean accept(File dir, String name) {
- return name.endsWith(".csv");
- }
- };
- ArrayList<File> csvFiles = new ArrayList<File>();
- for (File file : new File(dirname).listFiles(filter)) {
- csvFiles.add(file);
- }
- return buildDictionary(csvFiles);
- }
-
- public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
- TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
-
- for (File file : csvFiles){
- FileInputStream inputStream = new FileInputStream(file);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
- BufferedReader reader = new BufferedReader(streamReader);
-
- String line = null;
- while ((line = reader.readLine()) != null) {
- String[] entry = CSVUtil.parse(line);
- if(entry.length < 13) {
- System.out.println("Entry in CSV is not valid: " + line);
- continue;
- }
- int next = dictionary.put(formatEntry(entry));
-
- if(next == offset){
- System.out.println("Failed to process line: " + line);
- continue;
- }
-
- dictionaryEntries.put(offset, entry[0]);
- offset = next;
-
- // NFKC normalize dictionary entry
- if (normalizeEntries) {
- if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
- continue;
- }
- String[] normalizedEntry = new String[entry.length];
- for (int i = 0; i < entry.length; i++) {
- normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
- }
-
- next = dictionary.put(formatEntry(normalizedEntry));
- dictionaryEntries.put(offset, normalizedEntry[0]);
- offset = next;
- }
- }
- }
-
- return dictionary;
- }
-
- /*
- * IPADIC features
- *
- * 0 - surface
- * 1 - left cost
- * 2 - right cost
- * 3 - word cost
- * 4-9 - pos
- * 10 - base form
- * 11 - reading
- * 12 - pronounciation
- *
- * UniDic features
- *
- * 0 - surface
- * 1 - left cost
- * 2 - right cost
- * 3 - word cost
- * 4-9 - pos
- * 10 - base form reading
- * 11 - base form
- * 12 - surface form
- * 13 - surface reading
- */
- public String[] formatEntry(String[] features) {
- if (this.format == DictionaryFormat.IPADIC) {
- return features;
- } else {
- String[] features2 = new String[13];
- features2[0] = features[0];
- features2[1] = features[1];
- features2[2] = features[2];
- features2[3] = features[3];
- features2[4] = features[4];
- features2[5] = features[5];
- features2[6] = features[6];
- features2[7] = features[7];
- features2[8] = features[8];
- features2[9] = features[9];
- features2[10] = features[11];
-
- // If the surface reading is non-existent, use surface form for reading and pronunciation.
- // This happens with punctuation in UniDic and there are possibly other cases as well
- if (features[13].length() == 0) {
- features2[11] = features[0];
- features2[12] = features[0];
- } else {
- features2[11] = features[13];
- features2[12] = features[13];
- }
- return features2;
- }
- }
-
- public Set<Entry<Integer, String>> entrySet() {
- return dictionaryEntries.entrySet();
- }
+
+ /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
+ private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
+
+ private TreeMap<Integer, String> dictionaryEntries; // wordId, surface form
+
+ private String encoding = "euc-jp";
+
+ private boolean normalizeEntries = false;
+
+ private DictionaryFormat format = DictionaryFormat.IPADIC;
+
+ public TokenInfoDictionaryBuilder() {
+ }
+
+ public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
+ this.format = format;
+ this.encoding = encoding;
+ this.dictionaryEntries = new TreeMap<Integer, String>();
+ this.normalizeEntries = normalizeEntries;
+ }
+
+ public TokenInfoDictionary build(String dirname) throws IOException {
+ FilenameFilter filter = new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.endsWith(".csv");
+ }
+ };
+ ArrayList<File> csvFiles = new ArrayList<File>();
+ for (File file : new File(dirname).listFiles(filter)) {
+ csvFiles.add(file);
+ }
+ return buildDictionary(csvFiles);
+ }
+
+ public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
+ TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
+
+ for (File file : csvFiles){
+ FileInputStream inputStream = new FileInputStream(file);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+ BufferedReader reader = new BufferedReader(streamReader);
+
+ String line = null;
+ while ((line = reader.readLine()) != null) {
+ String[] entry = CSVUtil.parse(line);
+ if(entry.length < 13) {
+ System.out.println("Entry in CSV is not valid: " + line);
+ continue;
+ }
+ int next = dictionary.put(formatEntry(entry));
+
+ if(next == offset){
+ System.out.println("Failed to process line: " + line);
+ continue;
+ }
+
+ dictionaryEntries.put(offset, entry[0]);
+ offset = next;
+
+ // NFKC normalize dictionary entry
+ if (normalizeEntries) {
+ if (entry[0].equals(Normalizer.normalize(entry[0], Normalizer.Form.NFKC))){
+ continue;
+ }
+ String[] normalizedEntry = new String[entry.length];
+ for (int i = 0; i < entry.length; i++) {
+ normalizedEntry[i] = Normalizer.normalize(entry[i], Normalizer.Form.NFKC);
+ }
+
+ next = dictionary.put(formatEntry(normalizedEntry));
+ dictionaryEntries.put(offset, normalizedEntry[0]);
+ offset = next;
+ }
+ }
+ }
+
+ return dictionary;
+ }
+
+ /*
+ * IPADIC features
+ *
+ * 0 - surface
+ * 1 - left cost
+ * 2 - right cost
+ * 3 - word cost
+ * 4-9 - pos
+ * 10 - base form
+ * 11 - reading
+ * 12 - pronounciation
+ *
+ * UniDic features
+ *
+ * 0 - surface
+ * 1 - left cost
+ * 2 - right cost
+ * 3 - word cost
+ * 4-9 - pos
+ * 10 - base form reading
+ * 11 - base form
+ * 12 - surface form
+ * 13 - surface reading
+ */
+ public String[] formatEntry(String[] features) {
+ if (this.format == DictionaryFormat.IPADIC) {
+ return features;
+ } else {
+ String[] features2 = new String[13];
+ features2[0] = features[0];
+ features2[1] = features[1];
+ features2[2] = features[2];
+ features2[3] = features[3];
+ features2[4] = features[4];
+ features2[5] = features[5];
+ features2[6] = features[6];
+ features2[7] = features[7];
+ features2[8] = features[8];
+ features2[9] = features[9];
+ features2[10] = features[11];
+
+ // If the surface reading is non-existent, use surface form for reading and pronunciation.
+ // This happens with punctuation in UniDic and there are possibly other cases as well
+ if (features[13].length() == 0) {
+ features2[11] = features[0];
+ features2[12] = features[0];
+ } else {
+ features2[11] = features[13];
+ features2[12] = features[13];
+ }
+ return features2;
+ }
+ }
+
+ public Set<Entry<Integer, String>> entrySet() {
+ return dictionaryEntries.entrySet();
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Tue Jan 3 04:33:56 2012
@@ -26,88 +26,88 @@ import java.io.LineNumberReader;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
public class UnknownDictionaryBuilder {
- private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
-
- private String encoding = "euc-jp";
-
- public UnknownDictionaryBuilder() {
-
- }
-
- public UnknownDictionaryBuilder(String encoding) {
- this.encoding = encoding;
- }
-
- public UnknownDictionary build(String dirname) throws IOException {
- UnknownDictionary unkDictionary = null;
- unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
- readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
- return unkDictionary;
- }
-
- public UnknownDictionary readDictionaryFile(String filename)
- throws IOException {
- return readDictionaryFile(filename, encoding);
- }
-
- public UnknownDictionary readDictionaryFile(String filename, String encoding)
- throws IOException {
- UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
-
- FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
-
- String line = null;
- while ((line = lineReader.readLine()) != null) {
- dictionary.put(CSVUtil.parse(line)); // Probably we don't need to validate entry
- }
-
- return dictionary;
- }
-
- public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
- FileInputStream inputStream = new FileInputStream(filename);
- InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
- LineNumberReader lineReader = new LineNumberReader(streamReader);
-
- String line = null;
-
- while ((line = lineReader.readLine()) != null) {
- line = line.replaceAll("^\\s", "");
- line = line.replaceAll("\\s*#.*", "");
- line = line.replaceAll("\\s+", " ");
-
- // Skip empty line or comment line
- if(line.length() == 0) {
- continue;
- }
-
- if(line.startsWith("0x")) { // Category mapping
- String[] values = line.split(" ", 2); // Split only first space
-
- if(!values[0].contains("..")) {
- int cp = Integer.decode(values[0]).intValue();
- dictionary.putCharacterCategory(cp, values[1]);
- } else {
- String[] codePoints = values[0].split("\\.\\.");
- int cpFrom = Integer.decode(codePoints[0]).intValue();
- int cpTo = Integer.decode(codePoints[1]).intValue();
-
- for(int i = cpFrom; i <= cpTo; i++){
- dictionary.putCharacterCategory(i, values[1]);
- }
- }
- } else { // Invoke definition
- String[] values = line.split(" "); // Consecutive space is merged above
- String characterClassName = values[0];
- int invoke = Integer.parseInt(values[1]);
- int group = Integer.parseInt(values[2]);
- int length = Integer.parseInt(values[3]);
- dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
- }
- }
- }
+ private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
+
+ private String encoding = "euc-jp";
+
+ public UnknownDictionaryBuilder() {
+
+ }
+
+ public UnknownDictionaryBuilder(String encoding) {
+ this.encoding = encoding;
+ }
+
+ public UnknownDictionary build(String dirname) throws IOException {
+ UnknownDictionary unkDictionary = null;
+ unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file
+ readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
+ return unkDictionary;
+ }
+
+ public UnknownDictionary readDictionaryFile(String filename)
+ throws IOException {
+ return readDictionaryFile(filename, encoding);
+ }
+
+ public UnknownDictionary readDictionaryFile(String filename, String encoding)
+ throws IOException {
+ UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
+
+ FileInputStream inputStream = new FileInputStream(filename);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+ LineNumberReader lineReader = new LineNumberReader(streamReader);
+
+ dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
+
+ String line = null;
+ while ((line = lineReader.readLine()) != null) {
+ dictionary.put(CSVUtil.parse(line)); // Probably we don't need to validate entry
+ }
+
+ return dictionary;
+ }
+
+ public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
+ FileInputStream inputStream = new FileInputStream(filename);
+ InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
+ LineNumberReader lineReader = new LineNumberReader(streamReader);
+
+ String line = null;
+
+ while ((line = lineReader.readLine()) != null) {
+ line = line.replaceAll("^\\s", "");
+ line = line.replaceAll("\\s*#.*", "");
+ line = line.replaceAll("\\s+", " ");
+
+ // Skip empty line or comment line
+ if(line.length() == 0) {
+ continue;
+ }
+
+ if(line.startsWith("0x")) { // Category mapping
+ String[] values = line.split(" ", 2); // Split only first space
+
+ if(!values[0].contains("..")) {
+ int cp = Integer.decode(values[0]).intValue();
+ dictionary.putCharacterCategory(cp, values[1]);
+ } else {
+ String[] codePoints = values[0].split("\\.\\.");
+ int cpFrom = Integer.decode(codePoints[0]).intValue();
+ int cpTo = Integer.decode(codePoints[1]).intValue();
+
+ for(int i = cpFrom; i <= cpTo; i++){
+ dictionary.putCharacterCategory(i, values[1]);
+ }
+ }
+ } else { // Invoke definition
+ String[] values = line.split(" "); // Consecutive space is merged above
+ String characterClassName = values[0];
+ int invoke = Integer.parseInt(values[1]);
+ int group = Integer.parseInt(values[2]);
+ int length = Integer.parseInt(values[3]);
+ dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+ }
+ }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Tue Jan 3 04:33:56 2012
@@ -25,202 +25,202 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class GraphvizFormatter {
-
- private final static String BOS_LABEL = "BOS";
-
- private final static String EOS_LABEL = "EOS";
-
- private final static String FONT_NAME = "Helvetica";
-
- private ConnectionCosts costs;
-
- private Map<String, ViterbiNode> nodeMap;
-
- private Map<String, String> bestPathMap;
-
- private boolean foundBOS;
-
- public GraphvizFormatter(ConnectionCosts costs) {
- this.costs = costs;
- this.nodeMap = new HashMap<String, ViterbiNode>();
- this.bestPathMap = new HashMap<String, String>();
- }
-
- public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
- initBestPathMap(null);
-
- StringBuilder sb = new StringBuilder();
- sb.append(formatHeader());
- sb.append(formatNodes(startsArray, endsArray));
- sb.append(formatTrailer());
- return sb.toString();
- }
-
- public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
-
-// List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
- initBestPathMap(bestPath);
-
- StringBuilder sb = new StringBuilder();
- sb.append(formatHeader());
- sb.append(formatNodes(startsArray, endsArray));
- sb.append(formatTrailer());
- return sb.toString();
-
- }
-
- private void initBestPathMap(List<ViterbiNode> bestPath) {
- this.bestPathMap.clear();
-
- if (bestPath == null){
- return;
- }
- for (int i = 0; i < bestPath.size() - 1; i++) {
- ViterbiNode from = bestPath.get(i);
- ViterbiNode to = bestPath.get(i + 1);
-
- String fromId = getNodeId(from);
- String toId = getNodeId(to);
-
- assert this.bestPathMap.containsKey(fromId) == false;
- assert this.bestPathMap.containsValue(toId) == false;
- this.bestPathMap.put(fromId, toId);
- }
- }
-
- private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
- this.nodeMap.clear();
- this.foundBOS = false;
-
- StringBuilder sb = new StringBuilder();
- for (int i = 1; i < endsArray.length; i++) {
- if(endsArray[i] == null || startsArray[i] == null) {
- continue;
- }
- for (int j = 0; j < endsArray[i].length; j++) {
- ViterbiNode from = endsArray[i][j];
- if(from == null){
- continue;
- }
- sb.append(formatNodeIfNew(from));
- for (int k = 0; k < startsArray[i].length; k++) {
- ViterbiNode to = startsArray[i][k];
- if(to == null){
- break;
- }
- sb.append(formatNodeIfNew(to));
- sb.append(formatEdge(from, to));
- }
- }
- }
- return sb.toString();
- }
-
- private String formatNodeIfNew(ViterbiNode node) {
- String nodeId = getNodeId(node);
- if (! this.nodeMap.containsKey(nodeId)) {
- this.nodeMap.put(nodeId, node);
- return formatNode(node);
- } else {
- return "";
- }
- }
-
- private String formatHeader() {
- StringBuilder sb = new StringBuilder();
- sb.append("digraph viterbi {\n");
- sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
- sb.append("# A2 paper size\n");
- sb.append("size = \"34.4,16.5\";\n");
- sb.append("# try to fill paper\n");
- sb.append("ratio = fill;\n");
- sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
- sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
-
- return sb.toString();
- }
-
- private String formatTrailer() {
- return "}";
- }
-
-
- private String formatEdge(ViterbiNode from, ViterbiNode to) {
- if (this.bestPathMap.containsKey(getNodeId(from)) &&
- this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
- return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
-
- } else {
- return formatEdge(from, to, "");
- }
- }
-
-
- private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
- StringBuilder sb = new StringBuilder();
- sb.append(getNodeId(from));
- sb.append(" -> ");
- sb.append(getNodeId(to));
- sb.append(" [ ");
- sb.append("label=\"");
- sb.append(getCost(from, to));
- sb.append("\"");
- sb.append(" ");
- sb.append(attributes);
- sb.append(" ");
- sb.append(" ]");
- sb.append("\n");
- return sb.toString();
- }
-
- private String formatNode(ViterbiNode node) {
- StringBuilder sb = new StringBuilder();
- sb.append("\"");
- sb.append(getNodeId(node));
- sb.append("\"");
- sb.append(" [ ");
- sb.append("label=");
- sb.append(formatNodeLabel(node));
- sb.append(" ]");
- return sb.toString();
- }
-
- private String formatNodeLabel(ViterbiNode node) {
- StringBuilder sb = new StringBuilder();
- sb.append("<<table border=\"0\" cellborder=\"0\">");
- sb.append("<tr><td>");
- sb.append(getNodeLabel(node));
- sb.append("</td></tr>");
- sb.append("<tr><td>");
- sb.append("<font color=\"blue\">");
- sb.append(node.getWordCost());
- sb.append("</font>");
- sb.append("</td></tr>");
-// sb.append("<tr><td>");
-// sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
-// sb.append("</td></tr>");
- sb.append("</table>>");
- return sb.toString();
- }
-
- private String getNodeId(ViterbiNode node) {
- return String.valueOf(node.hashCode());
- }
-
- private String getNodeLabel(ViterbiNode node) {
- if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
- if (this.foundBOS) {
- return EOS_LABEL;
- } else {
- this.foundBOS = true;
- return BOS_LABEL;
- }
- } else {
- return node.getSurfaceForm();
- }
- }
-
- private int getCost(ViterbiNode from, ViterbiNode to) {
- return this.costs.get(from.getLeftId(), to.getRightId());
- }
+
+ private final static String BOS_LABEL = "BOS";
+
+ private final static String EOS_LABEL = "EOS";
+
+ private final static String FONT_NAME = "Helvetica";
+
+ private ConnectionCosts costs;
+
+ private Map<String, ViterbiNode> nodeMap;
+
+ private Map<String, String> bestPathMap;
+
+ private boolean foundBOS;
+
+ public GraphvizFormatter(ConnectionCosts costs) {
+ this.costs = costs;
+ this.nodeMap = new HashMap<String, ViterbiNode>();
+ this.bestPathMap = new HashMap<String, String>();
+ }
+
+ public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+ initBestPathMap(null);
+
+ StringBuilder sb = new StringBuilder();
+ sb.append(formatHeader());
+ sb.append(formatNodes(startsArray, endsArray));
+ sb.append(formatTrailer());
+ return sb.toString();
+ }
+
+ public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
+
+ // List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
+ initBestPathMap(bestPath);
+
+ StringBuilder sb = new StringBuilder();
+ sb.append(formatHeader());
+ sb.append(formatNodes(startsArray, endsArray));
+ sb.append(formatTrailer());
+ return sb.toString();
+
+ }
+
+ private void initBestPathMap(List<ViterbiNode> bestPath) {
+ this.bestPathMap.clear();
+
+ if (bestPath == null){
+ return;
+ }
+ for (int i = 0; i < bestPath.size() - 1; i++) {
+ ViterbiNode from = bestPath.get(i);
+ ViterbiNode to = bestPath.get(i + 1);
+
+ String fromId = getNodeId(from);
+ String toId = getNodeId(to);
+
+ assert this.bestPathMap.containsKey(fromId) == false;
+ assert this.bestPathMap.containsValue(toId) == false;
+ this.bestPathMap.put(fromId, toId);
+ }
+ }
+
+ private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+ this.nodeMap.clear();
+ this.foundBOS = false;
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 1; i < endsArray.length; i++) {
+ if(endsArray[i] == null || startsArray[i] == null) {
+ continue;
+ }
+ for (int j = 0; j < endsArray[i].length; j++) {
+ ViterbiNode from = endsArray[i][j];
+ if(from == null){
+ continue;
+ }
+ sb.append(formatNodeIfNew(from));
+ for (int k = 0; k < startsArray[i].length; k++) {
+ ViterbiNode to = startsArray[i][k];
+ if(to == null){
+ break;
+ }
+ sb.append(formatNodeIfNew(to));
+ sb.append(formatEdge(from, to));
+ }
+ }
+ }
+ return sb.toString();
+ }
+
+ private String formatNodeIfNew(ViterbiNode node) {
+ String nodeId = getNodeId(node);
+ if (! this.nodeMap.containsKey(nodeId)) {
+ this.nodeMap.put(nodeId, node);
+ return formatNode(node);
+ } else {
+ return "";
+ }
+ }
+
+ private String formatHeader() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("digraph viterbi {\n");
+ sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
+ sb.append("# A2 paper size\n");
+ sb.append("size = \"34.4,16.5\";\n");
+ sb.append("# try to fill paper\n");
+ sb.append("ratio = fill;\n");
+ sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+ sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+
+ return sb.toString();
+ }
+
+ private String formatTrailer() {
+ return "}";
+ }
+
+
+ private String formatEdge(ViterbiNode from, ViterbiNode to) {
+ if (this.bestPathMap.containsKey(getNodeId(from)) &&
+ this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
+ return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
+
+ } else {
+ return formatEdge(from, to, "");
+ }
+ }
+
+
+ private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(getNodeId(from));
+ sb.append(" -> ");
+ sb.append(getNodeId(to));
+ sb.append(" [ ");
+ sb.append("label=\"");
+ sb.append(getCost(from, to));
+ sb.append("\"");
+ sb.append(" ");
+ sb.append(attributes);
+ sb.append(" ");
+ sb.append(" ]");
+ sb.append("\n");
+ return sb.toString();
+ }
+
+ private String formatNode(ViterbiNode node) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("\"");
+ sb.append(getNodeId(node));
+ sb.append("\"");
+ sb.append(" [ ");
+ sb.append("label=");
+ sb.append(formatNodeLabel(node));
+ sb.append(" ]");
+ return sb.toString();
+ }
+
+ private String formatNodeLabel(ViterbiNode node) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("<<table border=\"0\" cellborder=\"0\">");
+ sb.append("<tr><td>");
+ sb.append(getNodeLabel(node));
+ sb.append("</td></tr>");
+ sb.append("<tr><td>");
+ sb.append("<font color=\"blue\">");
+ sb.append(node.getWordCost());
+ sb.append("</font>");
+ sb.append("</td></tr>");
+ // sb.append("<tr><td>");
+ // sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
+ // sb.append("</td></tr>");
+ sb.append("</table>>");
+ return sb.toString();
+ }
+
+ private String getNodeId(ViterbiNode node) {
+ return String.valueOf(node.hashCode());
+ }
+
+ private String getNodeLabel(ViterbiNode node) {
+ if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
+ if (this.foundBOS) {
+ return EOS_LABEL;
+ } else {
+ this.foundBOS = true;
+ return BOS_LABEL;
+ }
+ } else {
+ return node.getSurfaceForm();
+ }
+ }
+
+ private int getCost(ViterbiNode from, ViterbiNode to) {
+ return this.costs.get(from.getLeftId(), to.getRightId());
+ }
}
\ No newline at end of file