You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC
svn commit: r1226637 [1/3] - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...

Author: rmuir
Date: Tue Jan  3 04:33:56 2012
New Revision: 1226637

URL: http://svn.apache.org/viewvc?rev=1226637&view=rev
Log:
LUCENE-3305: indent 2 spaces

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Tue Jan  3 04:33:56 2012
@@ -32,58 +32,58 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
 
 public class DebugTokenizer {
-
-	private GraphvizFormatter formatter;
-	
-	private Viterbi viterbi;
-	
-	protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
-
-		this.viterbi = new Viterbi(Dictionaries.getTrie(),
-								   Dictionaries.getDictionary(),
-								   Dictionaries.getUnknownDictionary(),
-								   Dictionaries.getCosts(),
-								   userDictionary,
-								   mode);
-		
-		this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
-	}
-	
-	public String debugTokenize(String text) {
-		ViterbiNode[][][] lattice = this.viterbi.build(text);
-		List<ViterbiNode> bestPath = this.viterbi.search(lattice);
-		return this.formatter.format(lattice[0], lattice[1], bestPath);
-	}
-	
-	public static Builder builder() {
-		return new Builder();
-	}
-	
-	public static class Builder {
-
-		private Mode mode = Mode.NORMAL;
-		
-		private UserDictionary userDictionary = null;
-		
-		public synchronized Builder mode(Mode mode) {
-			this.mode = mode;
-			return this;
-		}
-		
-		public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
-			throws IOException {
-			this.userDictionary = UserDictionary.read(userDictionaryInputStream);
-			return this;
-		}
-
-		public synchronized Builder userDictionary(String userDictionaryPath)
-			throws FileNotFoundException, IOException {
-			this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
-			return this;
-		}
-		
-		public synchronized DebugTokenizer build() {
-			return new DebugTokenizer(userDictionary, mode);
-		}
-	}
+  
+  private GraphvizFormatter formatter;
+  
+  private Viterbi viterbi;
+  
+  protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
+    
+    this.viterbi = new Viterbi(Dictionaries.getTrie(),
+        Dictionaries.getDictionary(),
+        Dictionaries.getUnknownDictionary(),
+        Dictionaries.getCosts(),
+        userDictionary,
+        mode);
+    
+    this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
+  }
+  
+  public String debugTokenize(String text) {
+    ViterbiNode[][][] lattice = this.viterbi.build(text);
+    List<ViterbiNode> bestPath = this.viterbi.search(lattice);
+    return this.formatter.format(lattice[0], lattice[1], bestPath);
+  }
+  
+  public static Builder builder() {
+    return new Builder();
+  }
+  
+  public static class Builder {
+    
+    private Mode mode = Mode.NORMAL;
+    
+    private UserDictionary userDictionary = null;
+    
+    public synchronized Builder mode(Mode mode) {
+      this.mode = mode;
+      return this;
+    }
+    
+    public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
+        throws IOException {
+      this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+      return this;
+    }
+    
+    public synchronized Builder userDictionary(String userDictionaryPath)
+        throws FileNotFoundException, IOException {
+      this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
+      return this;
+    }
+    
+    public synchronized DebugTokenizer build() {
+      return new DebugTokenizer(userDictionary, mode);
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan  3 04:33:56 2012
@@ -21,90 +21,90 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
 public class Token {
-	private final Dictionary dictionary;
-
-	private final int wordId;
-	
-	private final String surfaceForm;
-	
-	private final int position;
-	
-	private final Type type;
-	
-	public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
-		this.wordId = wordId;
-		this.surfaceForm = surfaceForm;
-		this.type = type;
-		this.position = position;
-		this.dictionary = dictionary;
-	}
-
-	/**
-	 * @return surfaceForm
-	 */
-	public String getSurfaceForm() {
-		return surfaceForm;
-	}
-
-	/**
-	 * @return all features
-	 */
-	public String getAllFeatures() {
-		return dictionary.getAllFeatures(wordId);
-	}
-
-	/**
-	 * @return all features as array
-	 */
-	public String[] getAllFeaturesArray() {
-		return dictionary.getAllFeaturesArray(wordId);
-	}
-
-
-	/**
-	 * @return reading. null if token doesn't have reading.
-	 */
-	public String getReading() {
-		return dictionary.getReading(wordId);
-	}
-
-	/**
-	 * @return part of speech.
-	 */
-	public String getPartOfSpeech() {
-		return dictionary.getPartOfSpeech(wordId);
-	}
-
-	/**
-	 * Returns true if this token is known word
-	 * @return true if this token is in standard dictionary. false if not.
-	 */
-	public boolean isKnown() {
-		return type == Type.KNOWN;
-	}
-
-	/**
-	 * Returns true if this token is unknown word
-	 * @return true if this token is unknown word. false if not.
-	 */
-	public boolean isUnknown() {
-		return type == Type.UNKNOWN;
-	}
-	
-	/**
-	 * Returns true if this token is defined in user dictionary
-	 * @return true if this token is in user dictionary. false if not.
-	 */
-	public boolean isUser() {
-		return type == Type.USER;
-	}
-	
-	/**
-	 * Get index of this token in input text
-	 * @return position of token
-	 */
-	public int getPosition() {
-		return position;
-	}
-
+  private final Dictionary dictionary;
+  
+  private final int wordId;
+  
+  private final String surfaceForm;
+  
+  private final int position;
+  
+  private final Type type;
+  
+  public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+    this.wordId = wordId;
+    this.surfaceForm = surfaceForm;
+    this.type = type;
+    this.position = position;
+    this.dictionary = dictionary;
+  }
+  
+  /**
+   * @return surfaceForm
+   */
+  public String getSurfaceForm() {
+    return surfaceForm;
+  }
+  
+  /**
+   * @return all features
+   */
+  public String getAllFeatures() {
+    return dictionary.getAllFeatures(wordId);
+  }
+  
+  /**
+   * @return all features as array
+   */
+  public String[] getAllFeaturesArray() {
+    return dictionary.getAllFeaturesArray(wordId);
+  }
+  
+  
+  /**
+   * @return reading. null if token doesn't have reading.
+   */
+  public String getReading() {
+    return dictionary.getReading(wordId);
+  }
+  
+  /**
+   * @return part of speech.
+   */
+  public String getPartOfSpeech() {
+    return dictionary.getPartOfSpeech(wordId);
+  }
+  
+  /**
+   * Returns true if this token is known word
+   * @return true if this token is in standard dictionary. false if not.
+   */
+  public boolean isKnown() {
+    return type == Type.KNOWN;
+  }
+  
+  /**
+   * Returns true if this token is unknown word
+   * @return true if this token is unknown word. false if not.
+   */
+  public boolean isUnknown() {
+    return type == Type.UNKNOWN;
+  }
+  
+  /**
+   * Returns true if this token is defined in user dictionary
+   * @return true if this token is in user dictionary. false if not.
+   */
+  public boolean isUser() {
+    return type == Type.USER;
+  }
+  
+  /**
+   * Get index of this token in input text
+   * @return position of token
+   */
+  public int getPosition() {
+    return position;
+  }
+  
 }
\ No newline at end of file

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Tue Jan  3 04:33:56 2012
@@ -38,201 +38,201 @@ import org.apache.lucene.analysis.kuromo
  * Thread safe.
  */
 public class Tokenizer {
-	public enum Mode {
-		NORMAL, SEARCH, EXTENDED
-	}
-	
-	private final Viterbi viterbi;
-	
-	private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
-	
-	private final boolean split;
-	
-	/**
-	 * Constructor
-	 * @param dictionary
-	 * @param costs
-	 * @param trie
-	 * @param unkDictionary
-	 * @param userDictionary
-	 * @param mode
-	 */
-	protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
-
-		this.viterbi = new Viterbi(Dictionaries.getTrie(),
-				                   Dictionaries.getDictionary(),
-				                   Dictionaries.getUnknownDictionary(),
-				                   Dictionaries.getCosts(),
-				                   userDictionary,
-				                   mode);
-
-		this.split = split;
-		
-		dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
-		dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
-		dictionaryMap.put(Type.USER, userDictionary);
-	}
-
-	/**
-	 * Tokenize input text
-	 * @param text
-	 * @return list of Token
-	 */
-	public List<Token> tokenize(String text) {
-
-		if (!split) {
-			return doTokenize(0, text);			
-		}
-		
-		List<Integer> splitPositions = getSplitPositions(text);
-
-		if(splitPositions.size() == 0) {
-			return doTokenize(0, text);
-		}
-		
-		ArrayList<Token> result = new ArrayList<Token>();
-		int offset = 0;
-		for(int position : splitPositions) {
-			result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
-			offset = position + 1;
-		}
-		
-		if(offset < text.length()) {
-			result.addAll(doTokenize(offset, text.substring(offset)));
-		}
-		
-		return result;
-	}
-	
-	/**
-	 * Split input text at 句読点, which is 。 and 、
-	 * @param text
-	 * @return list of split position
-	 */
-	private List<Integer> getSplitPositions(String text) {
-		ArrayList<Integer> splitPositions = new ArrayList<Integer>();
-		
-		int position = 0;
-		int currentPosition = 0;
-
-		while(true) {
-			int indexOfMaru = text.indexOf("。", currentPosition);
-			int indexOfTen = text.indexOf("、", currentPosition);
-			
-			if(indexOfMaru < 0 || indexOfTen < 0) {
-				position = Math.max(indexOfMaru, indexOfTen);;
-			} else {
-				position = Math.min(indexOfMaru, indexOfTen);				
-			}
-			
-			if(position >= 0) {
-				splitPositions.add(position);
-				currentPosition = position + 1;
-			} else {
-				break;
-			}
-		}
-		
-		return splitPositions;
-	}
-	
-	/**
-	 * Tokenize input sentence.
-	 * @param offset offset of sentence in original input text
-	 * @param sentence sentence to tokenize
-	 * @return list of Token
-	 */
-	private List<Token> doTokenize(int offset, String sentence) {
-		ArrayList<Token> result = new ArrayList<Token>();
-		
-		ViterbiNode[][][] lattice = viterbi.build(sentence);
-		List<ViterbiNode> bestPath = viterbi.search(lattice);
-		for (ViterbiNode node : bestPath) {
-			int wordId = node.getWordId();
-			if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS 
-				continue;
-			}
-			Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
-			result.add(token);
-		}
-		
-		return result;
-	}
-	
-	/**
-	 * Get Builder to create Tokenizer instance.
-	 * @return Builder
-	 */
-	public static Builder builder() {
-		return new Builder();
-	}
-	
-	/**
-	 * Builder class used to create Tokenizer instance.
-	 */
-	public static class Builder {
-
-		private Mode mode = Mode.NORMAL;
-		
-		private boolean split = true;
-
-		private UserDictionary userDictionary = null;
-		
-		/**
-		 * Set tokenization mode
-		 * Default: NORMAL
-		 * @param mode tokenization mode
-		 * @return Builder
-		 */
-		public synchronized Builder mode(Mode mode) {
-			this.mode = mode;
-			return this;
-		}
-		
-		/**
-		 * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
-		 * Splitting shouldn't change the result of tokenization most of the cases.
-		 * Default: true
-		 * 
-		 * @param split whether tokenizer should split input string
-		 * @return Builder
-		 */
-		public synchronized Builder split(boolean split) {
-			this.split = split;
-			return this;
-		}
-		
-		/**
-		 * Set user dictionary input stream
-		 * @param userDictionaryInputStream dictionary file as input stream
-		 * @return Builder
-		 * @throws IOException 
-		 */
-		public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
-			this.userDictionary = UserDictionary.read(userDictionaryInputStream);
-			return this;
-		}
-		
-		/**
-		 * Set user dictionary path
-		 * @param userDictionaryPath path to dictionary file
-		 * @return Builder
-		 * @throws IOException 
-		 * @throws FileNotFoundException 
-		 */
-		public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
-			if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
-				this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
-			}
-			return this;
-		}
-		
-		/**
-		 * Create Tokenizer instance
-		 * @return Tokenizer
-		 */
-		public synchronized Tokenizer build() {
-			return new Tokenizer(userDictionary, mode, split);
-		}
-	}
+  public enum Mode {
+    NORMAL, SEARCH, EXTENDED
+  }
+  
+  private final Viterbi viterbi;
+  
+  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+  
+  private final boolean split;
+  
+  /**
+   * Constructor
+   * @param dictionary
+   * @param costs
+   * @param trie
+   * @param unkDictionary
+   * @param userDictionary
+   * @param mode
+   */
+  protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
+    
+    this.viterbi = new Viterbi(Dictionaries.getTrie(),
+        Dictionaries.getDictionary(),
+        Dictionaries.getUnknownDictionary(),
+        Dictionaries.getCosts(),
+        userDictionary,
+        mode);
+    
+    this.split = split;
+    
+    dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
+    dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
+    dictionaryMap.put(Type.USER, userDictionary);
+  }
+  
+  /**
+   * Tokenize input text
+   * @param text
+   * @return list of Token
+   */
+  public List<Token> tokenize(String text) {
+    
+    if (!split) {
+      return doTokenize(0, text);			
+    }
+    
+    List<Integer> splitPositions = getSplitPositions(text);
+    
+    if(splitPositions.size() == 0) {
+      return doTokenize(0, text);
+    }
+    
+    ArrayList<Token> result = new ArrayList<Token>();
+    int offset = 0;
+    for(int position : splitPositions) {
+      result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+      offset = position + 1;
+    }
+    
+    if(offset < text.length()) {
+      result.addAll(doTokenize(offset, text.substring(offset)));
+    }
+    
+    return result;
+  }
+  
+  /**
+   * Split input text at 句読点, which is 。 and 、
+   * @param text
+   * @return list of split position
+   */
+  private List<Integer> getSplitPositions(String text) {
+    ArrayList<Integer> splitPositions = new ArrayList<Integer>();
+    
+    int position = 0;
+    int currentPosition = 0;
+    
+    while(true) {
+      int indexOfMaru = text.indexOf("。", currentPosition);
+      int indexOfTen = text.indexOf("、", currentPosition);
+      
+      if(indexOfMaru < 0 || indexOfTen < 0) {
+        position = Math.max(indexOfMaru, indexOfTen);;
+      } else {
+        position = Math.min(indexOfMaru, indexOfTen);				
+      }
+      
+      if(position >= 0) {
+        splitPositions.add(position);
+        currentPosition = position + 1;
+      } else {
+        break;
+      }
+    }
+    
+    return splitPositions;
+  }
+  
+  /**
+   * Tokenize input sentence.
+   * @param offset offset of sentence in original input text
+   * @param sentence sentence to tokenize
+   * @return list of Token
+   */
+  private List<Token> doTokenize(int offset, String sentence) {
+    ArrayList<Token> result = new ArrayList<Token>();
+    
+    ViterbiNode[][][] lattice = viterbi.build(sentence);
+    List<ViterbiNode> bestPath = viterbi.search(lattice);
+    for (ViterbiNode node : bestPath) {
+      int wordId = node.getWordId();
+      if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS 
+        continue;
+      }
+      Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType()));	// Pass different dictionary based on the type of node
+      result.add(token);
+    }
+    
+    return result;
+  }
+  
+  /**
+   * Get Builder to create Tokenizer instance.
+   * @return Builder
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+  
+  /**
+   * Builder class used to create Tokenizer instance.
+   */
+  public static class Builder {
+    
+    private Mode mode = Mode.NORMAL;
+    
+    private boolean split = true;
+    
+    private UserDictionary userDictionary = null;
+    
+    /**
+     * Set tokenization mode
+     * Default: NORMAL
+     * @param mode tokenization mode
+     * @return Builder
+     */
+    public synchronized Builder mode(Mode mode) {
+      this.mode = mode;
+      return this;
+    }
+    
+    /**
+     * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
+     * Splitting shouldn't change the result of tokenization most of the cases.
+     * Default: true
+     * 
+     * @param split whether tokenizer should split input string
+     * @return Builder
+     */
+    public synchronized Builder split(boolean split) {
+      this.split = split;
+      return this;
+    }
+    
+    /**
+     * Set user dictionary input stream
+     * @param userDictionaryInputStream dictionary file as input stream
+     * @return Builder
+     * @throws IOException 
+     */
+    public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
+      this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+      return this;
+    }
+    
+    /**
+     * Set user dictionary path
+     * @param userDictionaryPath path to dictionary file
+     * @return Builder
+     * @throws IOException 
+     * @throws FileNotFoundException 
+     */
+    public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
+      if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
+        this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
+      }
+      return this;
+    }
+    
+    /**
+     * Create Tokenizer instance
+     * @return Tokenizer
+     */
+    public synchronized Tokenizer build() {
+      return new Tokenizer(userDictionary, mode, split);
+    }
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java Tue Jan  3 04:33:56 2012
@@ -21,78 +21,78 @@ import java.io.Serializable;
 import java.util.EnumMap;
 
 public final class CharacterDefinition implements Serializable {
-	private static final long serialVersionUID = -1436753619176638532L;
-	
-	private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
-
-	private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
-		new EnumMap<CharacterClass, int[]>(CharacterClass.class); // invoke, group, length
-
-	public enum CharacterClass {
-		NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
-
-		public int getId() {
-			return ordinal();
-		}
-	}
-
-	/**
-	 * Constructor
-	 */
-	public CharacterDefinition() {
-		for (int i = 0; i < characterCategoryMap.length; i++) {
-			characterCategoryMap[i] = CharacterClass.DEFAULT;
-		}
-	}
-
-	public int lookup(char c) {
-		return characterCategoryMap[c].getId();
-	}
-
-	public CharacterClass getCharacterClass(char c) {
-		return characterCategoryMap[c];
-	}
-
-	public boolean isInvoke(char c) {
-		CharacterClass characterClass = characterCategoryMap[c];
-		int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
-		return invokeDefinition[0] == 1;
-	}
-
-	public boolean isGroup(char c) {
-		CharacterClass characterClass = characterCategoryMap[c];
-		int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
-		return invokeDefinition[1] == 1;
-	}
-
-	public boolean isKanji(char c) {
-		return characterCategoryMap[c] == CharacterClass.KANJI ||
-			   characterCategoryMap[c] == CharacterClass.KANJINUMERIC;
-	}
-
-	/**
-	 * Put mapping from unicode code point to character class.
-	 * 
-	 * @param codePoint
-	 *            code point
-	 * @param class character class name
-	 */
-	public void putCharacterCategory(int codePoint, String characterClassName) {
-		characterClassName = characterClassName.split(" ")[0]; // use first
-																// category
-																// class
-
-		// Override Nakaguro
-		if (codePoint == 0x30FB) {
-			characterClassName = "SYMBOL";
-		}
-		characterCategoryMap[codePoint] = CharacterClass.valueOf(characterClassName);
-	}
-
-	public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
-		CharacterClass characterClass = CharacterClass
-				.valueOf(characterClassName);
-		int[] values = { invoke, group, length };
-		invokeDefinitionMap.put(characterClass, values);
-	}
+  private static final long serialVersionUID = -1436753619176638532L;
+  
+  private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
+  
+  private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
+      new EnumMap<CharacterClass, int[]>(CharacterClass.class); // invoke, group, length
+      
+      public enum CharacterClass {
+        NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
+        
+        public int getId() {
+          return ordinal();
+        }
+      }
+      
+      /**
+       * Constructor
+       */
+      public CharacterDefinition() {
+        for (int i = 0; i < characterCategoryMap.length; i++) {
+          characterCategoryMap[i] = CharacterClass.DEFAULT;
+        }
+      }
+      
+      public int lookup(char c) {
+        return characterCategoryMap[c].getId();
+      }
+      
+      public CharacterClass getCharacterClass(char c) {
+        return characterCategoryMap[c];
+      }
+      
+      public boolean isInvoke(char c) {
+        CharacterClass characterClass = characterCategoryMap[c];
+        int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
+        return invokeDefinition[0] == 1;
+      }
+      
+      public boolean isGroup(char c) {
+        CharacterClass characterClass = characterCategoryMap[c];
+        int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
+        return invokeDefinition[1] == 1;
+      }
+      
+      public boolean isKanji(char c) {
+        return characterCategoryMap[c] == CharacterClass.KANJI ||
+            characterCategoryMap[c] == CharacterClass.KANJINUMERIC;
+      }
+      
+      /**
+       * Put mapping from unicode code point to character class.
+       * 
+       * @param codePoint
+       *            code point
+       * @param class character class name
+       */
+      public void putCharacterCategory(int codePoint, String characterClassName) {
+        characterClassName = characterClassName.split(" ")[0]; // use first
+        // category
+        // class
+        
+        // Override Nakaguro
+        if (codePoint == 0x30FB) {
+          characterClassName = "SYMBOL";
+        }
+        characterCategoryMap[codePoint] = CharacterClass.valueOf(characterClassName);
+      }
+      
+      public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+        CharacterClass characterClass = CharacterClass
+            .valueOf(characterClassName);
+        int[] values = { invoke, group, length };
+        invokeDefinitionMap.put(characterClass, values);
+      }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Tue Jan  3 04:33:56 2012
@@ -28,53 +28,53 @@ import java.io.ObjectOutputStream;
 import java.io.Serializable;
 
 public class ConnectionCosts implements Serializable{
-
-	private static final long serialVersionUID = -7704592689635266457L;
-
-	public static final String FILENAME = "cc.dat";
-		
-	private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
-	
-	public ConnectionCosts() {
-		
-	}
-	
-	public ConnectionCosts(int forwardSize, int backwardSize) {
-		this.costs = new short[backwardSize][forwardSize]; 
-	}
-
-	public void add(int forwardId, int backwardId, int cost) {
-		this.costs[backwardId][forwardId] = (short)cost;
-	}
-	
-	public int get(int forwardId, int backwardId) {
-		// FIXME: There seems to be something wrong with the double array trie in some rare
-		// cases causing and IndexOutOfBoundsException.  Use a guard as a temporary work-around
-		// and return a high cost to advise Mr. Viterbi strongly to not use this transition
-		if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
-	    	return costs[backwardId][forwardId];
-	    } else {
-	    	return 50000;
-	    }
-	}
-
-	public void write(String directoryname) throws IOException {
-		String filename = directoryname + File.separator + FILENAME;
-		ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
-		outputStream.writeObject(this);
-		outputStream.close();
-	}
-
-	public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
-		InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
-		return read(is);
-	}
-	
-	public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
-		ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
-		ConnectionCosts instance = (ConnectionCosts) ois.readObject();
-		ois.close();
-		return instance;
-	}
-
+  
+  private static final long serialVersionUID = -7704592689635266457L;
+  
+  public static final String FILENAME = "cc.dat";
+  
+  private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
+  
+  public ConnectionCosts() {
+    
+  }
+  
+  public ConnectionCosts(int forwardSize, int backwardSize) {
+    this.costs = new short[backwardSize][forwardSize]; 
+  }
+  
+  public void add(int forwardId, int backwardId, int cost) {
+    this.costs[backwardId][forwardId] = (short)cost;
+  }
+  
+  public int get(int forwardId, int backwardId) {
+    // FIXME: There seems to be something wrong with the double array trie in some rare
+    // cases causing and IndexOutOfBoundsException.  Use a guard as a temporary work-around
+    // and return a high cost to advise Mr. Viterbi strongly to not use this transition
+    if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
+      return costs[backwardId][forwardId];
+    } else {
+      return 50000;
+    }
+  }
+  
+  public void write(String directoryname) throws IOException {
+    String filename = directoryname + File.separator + FILENAME;
+    ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+    outputStream.writeObject(this);
+    outputStream.close();
+  }
+  
+  public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
+    InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
+    return read(is);
+  }
+  
+  public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
+    ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+    ConnectionCosts instance = (ConnectionCosts) ois.readObject();
+    ois.close();
+    return instance;
+  }
+  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java Tue Jan  3 04:33:56 2012
@@ -20,91 +20,91 @@ package org.apache.lucene.analysis.kurom
 import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
 
 public final class Dictionaries {
-
-	private static TokenInfoDictionary dictionary;
-
-	private static UnknownDictionary unknownDictionary;
-
-	private static ConnectionCosts costs;
-
-	private static DoubleArrayTrie trie;
-	
-	private static boolean initialized = false;
-	
-	static {
-		load();
-	}
-
-	private static synchronized void load() {
-
-		if (Dictionaries.initialized) {
-			return;
-		}
-
-		try {
-			Dictionaries.dictionary = TokenInfoDictionary.getInstance();
-			Dictionaries.unknownDictionary = UnknownDictionary.getInstance();
-			Dictionaries.costs = ConnectionCosts.getInstance();
-			Dictionaries.trie = DoubleArrayTrie.getInstance();
-			Dictionaries.initialized = true;
-		} catch (Exception ex) {
-			throw new RuntimeException("Could not load dictionaries!  Ouch, ouch, ouch...", ex);
-		}
-	}
-
-	/**
-	 * @return the dictionary
-	 */
-	public static TokenInfoDictionary getDictionary() {
-		return dictionary;
-	}
-
-	/**
-	 * @param dictionary the dictionary to set
-	 */
-	public static void setDictionary(TokenInfoDictionary dictionary) {
-		Dictionaries.dictionary = dictionary;
-	}
-
-	/**
-	 * @return the unknownDictionary
-	 */
-	public static UnknownDictionary getUnknownDictionary() {
-		return unknownDictionary;
-	}
-
-	/**
-	 * @param unknownDictionary the unknownDictionary to set
-	 */
-	public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
-		Dictionaries.unknownDictionary = unknownDictionary;
-	}
-
-	/**
-	 * @return the costs
-	 */
-	public static ConnectionCosts getCosts() {
-		return costs;
-	}
-
-	/**
-	 * @param costs the costs to set
-	 */
-	public static void setCosts(ConnectionCosts costs) {
-		Dictionaries.costs = costs;
-	}
-
-	/**
-	 * @return the trie
-	 */
-	public static DoubleArrayTrie getTrie() {
-		return trie;
-	}
-
-	/**
-	 * @param trie the trie to set
-	 */
-	public static void setTrie(DoubleArrayTrie trie) {
-		Dictionaries.trie = trie;
-	}
+  
+  private static TokenInfoDictionary dictionary;
+  
+  private static UnknownDictionary unknownDictionary;
+  
+  private static ConnectionCosts costs;
+  
+  private static DoubleArrayTrie trie;
+  
+  private static boolean initialized = false;
+  
+  static {
+    load();
+  }
+  
+  private static synchronized void load() {
+    
+    if (Dictionaries.initialized) {
+      return;
+    }
+    
+    try {
+      Dictionaries.dictionary = TokenInfoDictionary.getInstance();
+      Dictionaries.unknownDictionary = UnknownDictionary.getInstance();
+      Dictionaries.costs = ConnectionCosts.getInstance();
+      Dictionaries.trie = DoubleArrayTrie.getInstance();
+      Dictionaries.initialized = true;
+    } catch (Exception ex) {
+      throw new RuntimeException("Could not load dictionaries!  Ouch, ouch, ouch...", ex);
+    }
+  }
+  
+  /**
+   * @return the dictionary
+   */
+  public static TokenInfoDictionary getDictionary() {
+    return dictionary;
+  }
+  
+  /**
+   * @param dictionary the dictionary to set
+   */
+  public static void setDictionary(TokenInfoDictionary dictionary) {
+    Dictionaries.dictionary = dictionary;
+  }
+  
+  /**
+   * @return the unknownDictionary
+   */
+  public static UnknownDictionary getUnknownDictionary() {
+    return unknownDictionary;
+  }
+  
+  /**
+   * @param unknownDictionary the unknownDictionary to set
+   */
+  public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
+    Dictionaries.unknownDictionary = unknownDictionary;
+  }
+  
+  /**
+   * @return the costs
+   */
+  public static ConnectionCosts getCosts() {
+    return costs;
+  }
+  
+  /**
+   * @param costs the costs to set
+   */
+  public static void setCosts(ConnectionCosts costs) {
+    Dictionaries.costs = costs;
+  }
+  
+  /**
+   * @return the trie
+   */
+  public static DoubleArrayTrie getTrie() {
+    return trie;
+  }
+  
+  /**
+   * @param trie the trie to set
+   */
+  public static void setTrie(DoubleArrayTrie trie) {
+    Dictionaries.trie = trie;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan  3 04:33:56 2012
@@ -18,63 +18,63 @@ package org.apache.lucene.analysis.kurom
  */
 
 public interface Dictionary {
-
-	public static final String INTERNAL_SEPARATOR = "\u0000";
-
-	/**
-	 * Get left id of specified word
-	 * @param wordId
-	 * @return	left id
-	 */
-	public int getLeftId(int wordId);
-	
-	/**
-	 * Get right id of specified word
-	 * @param wordId
-	 * @return	left id
-	 */
-	public int getRightId(int wordId);
-	
-	/**
-	 * Get word cost of specified word
-	 * @param wordId
-	 * @return	left id
-	 */
-	public int getWordCost(int wordId);
-
-	/**
-	 * Get all features of tokens
-	 * @param wordId word ID of token
-	 * @return All features of the token
-	 */
-	public String getAllFeatures(int wordId);
-
-	/**
-	 * Get all features as array
-	 * @param wordId word ID of token
-	 * @return Array containing all features of the token
-	 */
-	public String[] getAllFeaturesArray(int wordId);
-
-	/**
-	 * Get Part-Of-Speech of tokens
-	 * @param wordId word ID of token
-	 * @return Part-Of-Speech of the token
-	 */
-	public String getPartOfSpeech(int wordId);
-
-	/**
-	 * Get reading of tokens
-	 * @param wordId word ID of token
-	 * @return Reading of the token
-	 */
-	public String getReading(int wordId);
-	
-	/**
-	 * Get feature(s) of tokens
-	 * @param wordId word ID token
-	 * @param fields array of index. If this is empty, return all features.
-	 * @return Features of the token
-	 */
-	public String getFeature(int wordId, int... fields);
+  
+  public static final String INTERNAL_SEPARATOR = "\u0000";
+  
+  /**
+   * Get left id of specified word
+   * @param wordId
+   * @return	left id
+   */
+  public int getLeftId(int wordId);
+  
+  /**
+   * Get right id of specified word
+   * @param wordId
+   * @return	left id
+   */
+  public int getRightId(int wordId);
+  
+  /**
+   * Get word cost of specified word
+   * @param wordId
+   * @return	left id
+   */
+  public int getWordCost(int wordId);
+  
+  /**
+   * Get all features of tokens
+   * @param wordId word ID of token
+   * @return All features of the token
+   */
+  public String getAllFeatures(int wordId);
+  
+  /**
+   * Get all features as array
+   * @param wordId word ID of token
+   * @return Array containing all features of the token
+   */
+  public String[] getAllFeaturesArray(int wordId);
+  
+  /**
+   * Get Part-Of-Speech of tokens
+   * @param wordId word ID of token
+   * @return Part-Of-Speech of the token
+   */
+  public String getPartOfSpeech(int wordId);
+  
+  /**
+   * Get reading of tokens
+   * @param wordId word ID of token
+   * @return Reading of the token
+   */
+  public String getReading(int wordId);
+  
+  /**
+   * Get feature(s) of tokens
+   * @param wordId word ID token
+   * @param fields array of index. If this is empty, return all features.
+   * @return Features of the token
+   */
+  public String getFeature(int wordId, int... fields);
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Tue Jan  3 04:33:56 2012
@@ -35,210 +35,210 @@ import java.nio.channels.WritableByteCha
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
 
 public class TokenInfoDictionary implements Dictionary{
-
-	public static final String FILENAME = "tid.dat";
-
-	public static final String TARGETMAP_FILENAME = "tid_map.dat";
-
-	protected ByteBuffer buffer;
-	
-	protected int[][] targetMap;
-
-	public TokenInfoDictionary() {
-	}
-	
-	public TokenInfoDictionary(int size) {
-		targetMap = new int[1][];
-		buffer = ByteBuffer.allocate(size);
-	}
-
-	/**
-	 * put the entry in map
-	 * @param wordId
-	 * @param entry
-	 * @return current position of buffer, which will be wordId of next entry
-	 */
-	public int put(String[] entry) {
-		short leftId = Short.parseShort(entry[1]);
-		short rightId = Short.parseShort(entry[2]);
-		short wordCost = Short.parseShort(entry[3]);
-
-		StringBuilder sb = new StringBuilder();
-		for (int i = 4; i < entry.length; i++){
-			sb.append(entry[i]).append(INTERNAL_SEPARATOR);
-		}
-		String features = sb.deleteCharAt(sb.length() - 1).toString();
-		int featuresSize = features.length()* 2;
-
-		// extend buffer if necessary
-		int left = buffer.limit() - buffer.position();
-		if (8 + featuresSize > left) { // four short and features
-			ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
-			buffer.flip();
-			newBuffer.put(buffer);
-			buffer = newBuffer;
-		}
-		
-		buffer.putShort(leftId);
-		buffer.putShort(rightId);
-		buffer.putShort(wordCost);
-		buffer.putShort((short)featuresSize);
-		for (char c : features.toCharArray()){
-			buffer.putChar(c);
-		}
-
-		return buffer.position();
-	}
-
-	public void addMapping(int sourceId, int wordId) {
-		if(targetMap.length <= sourceId) {
-			int[][] newArray = new int[sourceId + 1][];
-			System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
-			targetMap = newArray;
-		}
-		
-		// Prepare array -- extend the length of array by one
-		int[] current = targetMap[sourceId];
-		if (current == null) {
-			current = new int[1];
-		} else {
-			int[] newArray = new int[current.length + 1];
-			System.arraycopy(current, 0, newArray, 0, current.length);
-			current = newArray;
-		}
-		targetMap[sourceId] = current;
-
-		int[] targets = targetMap[sourceId];
-		targets[targets.length - 1] = wordId;
-	}
-	
-	public int[] lookupWordIds(int sourceId) {
-		return targetMap[sourceId];
-	}
-	
-	@Override	
-	public int getLeftId(int wordId) {
-		return buffer.getShort(wordId);
-	}
-
-	@Override
-	public int getRightId(int wordId) {
-		return buffer.getShort(wordId + 2);	// Skip left id
-	}
-	
-	@Override
-	public int getWordCost(int wordId) {
-		return buffer.getShort(wordId + 4);	// Skip left id and right id
-	}
-
-	@Override
-	public String[] getAllFeaturesArray(int wordId) {
-		int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
-		char[] targetArr = new char[size];
-		int offset = wordId + 6 + 2; // offset is position where features string starts
-		for(int i = 0; i < size; i++){
-			targetArr[i] = buffer.getChar(offset + i * 2);
-		}
-		String allFeatures = new String(targetArr);
-		return allFeatures.split(INTERNAL_SEPARATOR);
-	}
-	
-	@Override
-	public String getFeature(int wordId, int... fields) {
-		String[] allFeatures = getAllFeaturesArray(wordId);
-		StringBuilder sb = new StringBuilder();
-		
-		if(fields.length == 0){ // All features
-			for(String feature : allFeatures) {
-				sb.append(CSVUtil.quoteEscape(feature)).append(",");
-			}
-		} else if(fields.length == 1) { // One feature doesn't need to escape value
-			sb.append(allFeatures[fields[0]]).append(",");			
-		} else {
-			for(int field : fields){
-				sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
-			}
-		}
-		
-		return sb.deleteCharAt(sb.length() - 1).toString();
-	}
-	
-	@Override
-	public String getReading(int wordId) {
-		return getFeature(wordId, 7);
-	}
-
-	@Override
-	public String getAllFeatures(int wordId) {
-		return getFeature(wordId);
-	}
-
-	@Override
-	public String getPartOfSpeech(int wordId) {
-		return getFeature(wordId, 0, 1, 2, 3);
-	}
-	
-
-	/**
-	 * Write dictionary in file
-	 * Dictionary format is:
-	 * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
-	 * @param filename
-	 * @throws IOException
-	 */
-	public void write(String directoryname) throws IOException {
-		writeDictionary(directoryname + File.separator + FILENAME);
-		writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
-	}
-
-	protected void writeTargetMap(String filename) throws IOException {
-		ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
-		oos.writeObject(targetMap);
-		oos.close();
-	}
-	
-	protected void writeDictionary(String filename) throws IOException {
-		FileOutputStream fos = new FileOutputStream(filename);
-		DataOutputStream dos = new DataOutputStream(fos);
-		dos.writeInt(buffer.position());
-		WritableByteChannel channel = Channels.newChannel(fos);
-		// Write Buffer
-		buffer.flip();  // set position to 0, set limit to current position
-		channel.write(buffer);
-		
-		fos.close();
-	}
-	
-	/**
-	 * Read dictionary into directly allocated buffer.
-	 * @return TokenInfoDictionary instance
-	 * @throws IOException
-	 * @throws ClassNotFoundException 
-	 */
-	public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
-		TokenInfoDictionary dictionary = new TokenInfoDictionary();
-		ClassLoader loader = dictionary.getClass().getClassLoader();
-		dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
-		dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
-		return dictionary;
-	}
-	
-	protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
-		ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
-		targetMap = (int[][]) ois.readObject();
-		is.close();
-	}
-	
-	protected void loadDictionary(InputStream is) throws IOException {
-		DataInputStream dis = new DataInputStream(is);
-		int size = dis.readInt();
-		
-		ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
-
-		ReadableByteChannel channel = Channels.newChannel(is);
-		channel.read(tmpBuffer);
-		is.close();
-		buffer = tmpBuffer.asReadOnlyBuffer();
-	}
-
+  
+  public static final String FILENAME = "tid.dat";
+  
+  public static final String TARGETMAP_FILENAME = "tid_map.dat";
+  
+  protected ByteBuffer buffer;
+  
+  protected int[][] targetMap;
+  
+  public TokenInfoDictionary() {
+  }
+  
+  public TokenInfoDictionary(int size) {
+    targetMap = new int[1][];
+    buffer = ByteBuffer.allocate(size);
+  }
+  
+  /**
+   * put the entry in map
+   * @param wordId
+   * @param entry
+   * @return current position of buffer, which will be wordId of next entry
+   */
+  public int put(String[] entry) {
+    short leftId = Short.parseShort(entry[1]);
+    short rightId = Short.parseShort(entry[2]);
+    short wordCost = Short.parseShort(entry[3]);
+    
+    StringBuilder sb = new StringBuilder();
+    for (int i = 4; i < entry.length; i++){
+      sb.append(entry[i]).append(INTERNAL_SEPARATOR);
+    }
+    String features = sb.deleteCharAt(sb.length() - 1).toString();
+    int featuresSize = features.length()* 2;
+    
+    // extend buffer if necessary
+    int left = buffer.limit() - buffer.position();
+    if (8 + featuresSize > left) { // four short and features
+      ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
+      buffer.flip();
+      newBuffer.put(buffer);
+      buffer = newBuffer;
+    }
+    
+    buffer.putShort(leftId);
+    buffer.putShort(rightId);
+    buffer.putShort(wordCost);
+    buffer.putShort((short)featuresSize);
+    for (char c : features.toCharArray()){
+      buffer.putChar(c);
+    }
+    
+    return buffer.position();
+  }
+  
+  public void addMapping(int sourceId, int wordId) {
+    if(targetMap.length <= sourceId) {
+      int[][] newArray = new int[sourceId + 1][];
+      System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+      targetMap = newArray;
+    }
+    
+    // Prepare array -- extend the length of array by one
+    int[] current = targetMap[sourceId];
+    if (current == null) {
+      current = new int[1];
+    } else {
+      int[] newArray = new int[current.length + 1];
+      System.arraycopy(current, 0, newArray, 0, current.length);
+      current = newArray;
+    }
+    targetMap[sourceId] = current;
+    
+    int[] targets = targetMap[sourceId];
+    targets[targets.length - 1] = wordId;
+  }
+  
+  public int[] lookupWordIds(int sourceId) {
+    return targetMap[sourceId];
+  }
+  
+  @Override	
+  public int getLeftId(int wordId) {
+    return buffer.getShort(wordId);
+  }
+  
+  @Override
+  public int getRightId(int wordId) {
+    return buffer.getShort(wordId + 2);	// Skip left id
+  }
+  
+  @Override
+  public int getWordCost(int wordId) {
+    return buffer.getShort(wordId + 4);	// Skip left id and right id
+  }
+  
+  @Override
+  public String[] getAllFeaturesArray(int wordId) {
+    int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
+    char[] targetArr = new char[size];
+    int offset = wordId + 6 + 2; // offset is position where features string starts
+    for(int i = 0; i < size; i++){
+      targetArr[i] = buffer.getChar(offset + i * 2);
+    }
+    String allFeatures = new String(targetArr);
+    return allFeatures.split(INTERNAL_SEPARATOR);
+  }
+  
+  @Override
+  public String getFeature(int wordId, int... fields) {
+    String[] allFeatures = getAllFeaturesArray(wordId);
+    StringBuilder sb = new StringBuilder();
+    
+    if(fields.length == 0){ // All features
+      for(String feature : allFeatures) {
+        sb.append(CSVUtil.quoteEscape(feature)).append(",");
+      }
+    } else if(fields.length == 1) { // One feature doesn't need to escape value
+      sb.append(allFeatures[fields[0]]).append(",");			
+    } else {
+      for(int field : fields){
+        sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+      }
+    }
+    
+    return sb.deleteCharAt(sb.length() - 1).toString();
+  }
+  
+  @Override
+  public String getReading(int wordId) {
+    return getFeature(wordId, 7);
+  }
+  
+  @Override
+  public String getAllFeatures(int wordId) {
+    return getFeature(wordId);
+  }
+  
+  @Override
+  public String getPartOfSpeech(int wordId) {
+    return getFeature(wordId, 0, 1, 2, 3);
+  }
+  
+  
+  /**
+   * Write dictionary in file
+   * Dictionary format is:
+   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+   * @param filename
+   * @throws IOException
+   */
+  public void write(String directoryname) throws IOException {
+    writeDictionary(directoryname + File.separator + FILENAME);
+    writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+  }
+  
+  protected void writeTargetMap(String filename) throws IOException {
+    ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
+    oos.writeObject(targetMap);
+    oos.close();
+  }
+  
+  protected void writeDictionary(String filename) throws IOException {
+    FileOutputStream fos = new FileOutputStream(filename);
+    DataOutputStream dos = new DataOutputStream(fos);
+    dos.writeInt(buffer.position());
+    WritableByteChannel channel = Channels.newChannel(fos);
+    // Write Buffer
+    buffer.flip();  // set position to 0, set limit to current position
+    channel.write(buffer);
+    
+    fos.close();
+  }
+  
+  /**
+   * Read dictionary into directly allocated buffer.
+   * @return TokenInfoDictionary instance
+   * @throws IOException
+   * @throws ClassNotFoundException 
+   */
+  public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
+    TokenInfoDictionary dictionary = new TokenInfoDictionary();
+    ClassLoader loader = dictionary.getClass().getClassLoader();
+    dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+    dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+    return dictionary;
+  }
+  
+  protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
+    ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+    targetMap = (int[][]) ois.readObject();
+    is.close();
+  }
+  
+  protected void loadDictionary(InputStream is) throws IOException {
+    DataInputStream dis = new DataInputStream(is);
+    int size = dis.readInt();
+    
+    ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+    
+    ReadableByteChannel channel = Channels.newChannel(is);
+    channel.read(tmpBuffer);
+    is.close();
+    buffer = tmpBuffer.asReadOnlyBuffer();
+  }
+  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan  3 04:33:56 2012
@@ -29,114 +29,114 @@ import java.io.ObjectOutputStream;
 import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
 
 public class UnknownDictionary extends TokenInfoDictionary {
-
-	public static final String FILENAME = "unk.dat";
-	
-	public static final String TARGETMAP_FILENAME = "unk_map.dat";
-
-	public static final String CHARDEF_FILENAME = "cd.dat";
-
-	private CharacterDefinition characterDefinition;
-	
-	/**
-	 * Constructor
-	 */
-    public UnknownDictionary() {
-    }
+  
+  public static final String FILENAME = "unk.dat";
+  
+  public static final String TARGETMAP_FILENAME = "unk_map.dat";
+  
+  public static final String CHARDEF_FILENAME = "cd.dat";
+  
+  private CharacterDefinition characterDefinition;
+  
+  /**
+   * Constructor
+   */
+  public UnknownDictionary() {
+  }
+  
+  public UnknownDictionary(int size) {
+    super(size);
+    characterDefinition = new CharacterDefinition();    	
+  }
+  
+  @Override
+  public int put(String[] entry) {
+    // Get wordId of current entry
+    int wordId = buffer.position();
     
-    public UnknownDictionary(int size) {
-    	super(size);
-		characterDefinition = new CharacterDefinition();    	
-    }
+    // Put entry
+    int result = super.put(entry);
     
-    @Override
-    public int put(String[] entry) {
-    	// Get wordId of current entry
-    	int wordId = buffer.position();
-    	
-    	// Put entry
-		int result = super.put(entry);
-
-		// Put entry in targetMap
-		int characterId = CharacterClass.valueOf(entry[0]).getId();
-		addMapping(characterId, wordId);
-		return result;
+    // Put entry in targetMap
+    int characterId = CharacterClass.valueOf(entry[0]).getId();
+    addMapping(characterId, wordId);
+    return result;
+  }
+  
+  public int lookup(String text) {
+    if(!characterDefinition.isGroup(text.charAt(0))) {
+      return 1;
     }
     
-    public int lookup(String text) {
-    	if(!characterDefinition.isGroup(text.charAt(0))) {
-    		return 1;
-    	}
-    	
-    	// Extract unknown word. Characters with the same character class are considered to be part of unknown word
-    	int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
-    	int length = 1;
-    	for (int i = 1; i < text.length(); i++) {
-    		if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
-        		length++;    			
-    		} else {
-    			break;
-    		}
-    	}
-    	
-    	return length;
+    // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+    int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
+    int length = 1;
+    for (int i = 1; i < text.length(); i++) {
+      if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
+        length++;    			
+      } else {
+        break;
+      }
     }
-
-	/**
-	 * Put mapping from unicode code point to character class.
-	 * 
-	 * @param codePoint code point
-	 * @param class character class name
-	 */
-	public void putCharacterCategory(int codePoint, String characterClassName) {
-		characterDefinition.putCharacterCategory(codePoint, characterClassName);
-	}
-	
-	public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
-		characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
-	}
-	
-
-	public CharacterDefinition getCharacterDefinition() {
-		return characterDefinition;
-	}
-	
-	/**
-	 * Write dictionary in file
-	 * Dictionary format is:
-	 * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
-	 * @param filename
-	 * @throws IOException
-	 */
-	public void write(String directoryname) throws IOException {
-		writeDictionary(directoryname + File.separator + FILENAME);
-		writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
-		writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
-	}
-	
-	protected void writeCharDef(String filename) throws IOException {
-		ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
-		oos.writeObject(characterDefinition);
-		oos.close();
-	}
-
-	public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
-		UnknownDictionary dictionary = new UnknownDictionary();
-		ClassLoader loader = dictionary.getClass().getClassLoader();
-		dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
-		dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
-		dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
-		return dictionary;
-	}
-
-	protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
-		ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
-		characterDefinition = (CharacterDefinition) ois.readObject();
-		ois.close();
-	}
-	
-	@Override
-	public String getReading(int wordId) {
-		return null;
-	}
+    
+    return length;
+  }
+  
+  /**
+   * Put mapping from unicode code point to character class.
+   * 
+   * @param codePoint code point
+   * @param class character class name
+   */
+  public void putCharacterCategory(int codePoint, String characterClassName) {
+    characterDefinition.putCharacterCategory(codePoint, characterClassName);
+  }
+  
+  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+    characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
+  }
+  
+  
+  public CharacterDefinition getCharacterDefinition() {
+    return characterDefinition;
+  }
+  
+  /**
+   * Write dictionary in file
+   * Dictionary format is:
+   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+   * @param filename
+   * @throws IOException
+   */
+  public void write(String directoryname) throws IOException {
+    writeDictionary(directoryname + File.separator + FILENAME);
+    writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+    writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
+  }
+  
+  protected void writeCharDef(String filename) throws IOException {
+    ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));		
+    oos.writeObject(characterDefinition);
+    oos.close();
+  }
+  
+  public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
+    UnknownDictionary dictionary = new UnknownDictionary();
+    ClassLoader loader = dictionary.getClass().getClassLoader();
+    dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+    dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+    dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
+    return dictionary;
+  }
+  
+  protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
+    ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+    characterDefinition = (CharacterDefinition) ois.readObject();
+    ois.close();
+  }
+  
+  @Override
+  public String getReading(int wordId) {
+    return null;
+  }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan  3 04:33:56 2012
@@ -30,167 +30,167 @@ import java.util.TreeMap;
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
 
 public class UserDictionary implements Dictionary {
-
-	private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
-	
-	private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
-	
-	private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
-
-	public static final int WORD_COST = -100000;
-
-	public static final int LEFT_ID = 5;
-
-	public static final int RIGHT_ID = 5;
-	
-	public UserDictionary() {
-		
-	}
-
-	/**
-	 * Lookup words in text
-	 * @param text
-	 * @return array of {wordId, position, length}
-	 */
-	public int[][] lookup(String text) {
-		TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
-
-		for (String keyword : entries.descendingKeySet()) {
-			int offset = 0;
-			int position = text.indexOf(keyword, offset);
-			while (offset < text.length() && position >= 0) {
-				if(!result.containsKey(position)){
-					result.put(position, entries.get(keyword));
-				}
-				offset += position + keyword.length();
-				position = text.indexOf(keyword, offset);
-			}
-		}
-
-		return toIndexArray(result);
-	}
-
-	/**
-	 * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
-	 * @param input
-	 * @return array of {wordId, index, length}
-	 */
-	private int[][] toIndexArray(Map<Integer, int[]> input) {
-		ArrayList<int[]> result = new ArrayList<int[]>();
-		for (int i : input.keySet()) {
-			int[] wordIdAndLength = input.get(i);
-			int wordId = wordIdAndLength[0];
-			// convert length to index
-			int current = i;
-			for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
-				int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
-				result.add(token);
-				current += wordIdAndLength[j];
-			}
-		}
-		return result.toArray(new int[result.size()][]);
-	}
-
-	@Override
-	public int getLeftId(int wordId) {
-		return LEFT_ID;
-	}
-
-	@Override
-	public int getRightId(int wordId) {
-		return RIGHT_ID;
-	}
-
-	@Override
-	public int getWordCost(int wordId) {
-		return WORD_COST;
-	}
-
-	@Override
-	public String getReading(int wordId) {
-		return getFeature(wordId, 0);
-	}
-
-	@Override
-	public String getPartOfSpeech(int wordId) {
-		return getFeature(wordId, 1);
-	}
-
-	@Override
-	public String getAllFeatures(int wordId) {
-		return getFeature(wordId);
-	}
-
-	@Override
-	public String[] getAllFeaturesArray(int wordId) {
-		String allFeatures = featureEntries.get(wordId);
-		if(allFeatures == null) {
-			return null;
-		}
-		
-		return allFeatures.split(INTERNAL_SEPARATOR);		
-	}
-
-	
-	@Override
-	public String getFeature(int wordId, int... fields) {
-		String[] allFeatures = getAllFeaturesArray(wordId);
-		if (allFeatures == null) {
-			return null;
-		}
-		StringBuilder sb = new StringBuilder();
-		if (fields.length == 0) { // All features
-			for (String feature : allFeatures) {
-				sb.append(CSVUtil.quoteEscape(feature)).append(",");
-			}
-		} else if (fields.length == 1) { // One feature doesn't need to escape value
-			sb.append(allFeatures[fields[0]]).append(",");			
-		} else {
-			for (int field : fields){
-				sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
-			}
-		}
-		return sb.deleteCharAt(sb.length() - 1).toString();
-	}
-
-	public static UserDictionary read(String filename) throws IOException {
-		return read(new FileInputStream(filename));
-	}
-
-	public static UserDictionary read(InputStream is) throws IOException {
-		UserDictionary dictionary = new UserDictionary();
-		BufferedReader reader = new BufferedReader(new InputStreamReader(is));
-		String line = null;
-		int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
-		while ((line = reader.readLine()) != null) {
-			// Remove comments
-			line = line.replaceAll("#.*$", "");
-
-			// Skip empty lines or comment lines
-			if (line.trim().length() == 0) {
-				continue;
-			}
-			String[] values = CSVUtil.parse(line);
-			String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
-			String[] readings = values[2].replaceAll("  *", " ").split(" ");
-			String pos = values[3];
-
-			if (segmentation.length != readings.length) {
-				// FIXME: Should probably deal with this differently.  Exception?
-				System.out.println("This entry is not properly formatted : " + line);
-			}
-
-			int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
-			wordIdAndLength[0] = wordId;
-			for (int i = 0; i < segmentation.length; i++) {
-				wordIdAndLength[i + 1] = segmentation[i].length();
-				dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
-				wordId++;
-			}
-			dictionary.entries.put(values[0], wordIdAndLength);
-		}
-		reader.close();
-		return dictionary;
-	}
-
+  /** Keyword -> {first wordId, segment length, segment length, ...}; a TreeMap so lookup() can walk the keys in descending order. */
+  private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
+  /** wordId -> reading + INTERNAL_SEPARATOR + part-of-speech; read() stores one entry per segment. */
+  private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
+  /** First wordId assigned by read(); each further segment gets the next consecutive id. */
+  private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
+  /** Value returned by getWordCost() for every wordId. */
+  public static final int WORD_COST = -100000;
+  /** Value returned by getLeftId() for every wordId. */
+  public static final int LEFT_ID = 5;
+  /** Value returned by getRightId() for every wordId. */
+  public static final int RIGHT_ID = 5;
+  /** Creates a dictionary with no entries; the static read() methods build populated instances. */
+  public UserDictionary() {
+    
+  }
+  
+  /**
+   * Finds every occurrence of each dictionary keyword in text; the first keyword (in descending key order) to claim a position keeps it.
+   * @param text text to scan
+   * @return array of {wordId, position, length}, one row per segment, in ascending position order
+   */
+  public int[][] lookup(String text) {
+    TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
+    for (String keyword : entries.descendingKeySet()) {
+      if (keyword.length() == 0) { continue; } // an empty keyword matches at every offset and would never advance the scan
+      int offset = 0;
+      int position = text.indexOf(keyword, offset);
+      while (offset < text.length() && position >= 0) {
+        if(!result.containsKey(position)){
+          result.put(position, entries.get(keyword));
+        }
+        offset = position + keyword.length(); // resume right after this match; "+=" double-counted and skipped later matches
+        position = text.indexOf(keyword, offset);
+      }
+    }
+    
+    return toIndexArray(result);
+  }
+  
+  /**
+   * Expands each (start index, {first wordId, length, length, ...}) entry into one {wordId, index, length} row per segment.
+   * @param input map from start index to the wordIdAndLength array of the keyword found there
+   * @return array of {wordId, index, length}
+   */
+  private int[][] toIndexArray(Map<Integer, int[]> input) {
+    ArrayList<int[]> rows = new ArrayList<int[]>();
+    for (Map.Entry<Integer, int[]> match : input.entrySet()) {
+      int[] idAndLengths = match.getValue();
+      int firstWordId = idAndLengths[0];
+      // each segment starts where the previous one ended
+      int start = match.getKey();
+      for (int seg = 1; seg < idAndLengths.length; seg++) { // slot 0 is the wordId offset, lengths follow
+        int length = idAndLengths[seg];
+        rows.add(new int[] { firstWordId + seg - 1, start, length });
+        start += length;
+      }
+    }
+    return rows.toArray(new int[rows.size()][]);
+  }
+  /** Returns LEFT_ID for every wordId; the argument is not consulted. */
+  @Override
+  public int getLeftId(int wordId) {
+    return LEFT_ID;
+  }
+  /** Returns RIGHT_ID for every wordId; the argument is not consulted. */
+  @Override
+  public int getRightId(int wordId) {
+    return RIGHT_ID;
+  }
+  /** Returns WORD_COST for every wordId; the argument is not consulted. */
+  @Override
+  public int getWordCost(int wordId) {
+    return WORD_COST;
+  }
+  /** Returns feature 0 of wordId (read() stores the reading first), or null when wordId has no entry. */
+  @Override
+  public String getReading(int wordId) {
+    return getFeature(wordId, 0);
+  }
+  /** Returns feature 1 of wordId (read() stores the part-of-speech after the reading), or null when wordId has no entry. */
+  @Override
+  public String getPartOfSpeech(int wordId) {
+    return getFeature(wordId, 1);
+  }
+  /** Returns all features of wordId as one comma-joined string (getFeature with no field list), or null when wordId has no entry. */
+  @Override
+  public String getAllFeatures(int wordId) {
+    return getFeature(wordId);
+  }
+  /** Splits the feature string stored for wordId on INTERNAL_SEPARATOR; returns null when wordId has no entry. */
+  @Override
+  public String[] getAllFeaturesArray(int wordId) {
+    final String joined = featureEntries.get(wordId);
+    if (joined != null) {
+      // String.split interprets INTERNAL_SEPARATOR as a regular expression
+      return joined.split(INTERNAL_SEPARATOR);
+    }
+    return null;
+  }
+  /** Joins the requested features of wordId with commas (no fields = all features; a lone field is returned unescaped); null if wordId has no entry. */
+  @Override
+  public String getFeature(int wordId, int... fields) {
+    String[] allFeatures = getAllFeaturesArray(wordId);
+    if (allFeatures == null) {
+      return null;
+    }
+    if (fields.length == 1) { // One feature doesn't need to escape value
+      return allFeatures[fields[0]];
+    }
+    StringBuilder sb = new StringBuilder();
+    if (fields.length == 0) { // All features
+      for (String feature : allFeatures) {
+        sb.append(CSVUtil.quoteEscape(feature)).append(",");
+      }
+    } else {
+      for (int field : fields) {
+        sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+      }
+    }
+    return sb.length() == 0 ? "" : sb.deleteCharAt(sb.length() - 1).toString(); // drop trailing comma; "" when nothing was appended
+  }
+  /** Opens the file at filename and delegates to read(InputStream). */
+  public static UserDictionary read(String filename) throws IOException {
+    return read(new FileInputStream(filename));
+  }
+  /**
+   * Parses CSV lines of the form keyword,segmentation,readings,part-of-speech ('#' starts a comment) into a new dictionary; malformed entries are reported on System.out and skipped, and the reader is closed even when reading fails.
+   */
+  public static UserDictionary read(InputStream is) throws IOException {
+    UserDictionary dictionary = new UserDictionary();
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is)); // NOTE(review): platform default charset - confirm intended encoding
+    try {
+      String line;
+      int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
+      while ((line = reader.readLine()) != null) {
+        line = line.replaceAll("#.*$", ""); // Remove comments
+        if (line.trim().length() == 0) { // Skip empty lines or comment lines
+          continue;
+        }
+        String[] values = CSVUtil.parse(line);
+        String[] segmentation = values.length < 4 ? null : values[1].replaceAll("  *", " ").split(" ");
+        String[] readings = values.length < 4 ? null : values[2].replaceAll("  *", " ").split(" ");
+        if (segmentation == null || segmentation.length != readings.length) {
+          System.out.println("This entry is not properly formatted : " + line);
+          continue; // indexing readings[i] below would fail or pair segments with the wrong reading
+        }
+        String pos = values[3];
+        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
+        wordIdAndLength[0] = wordId;
+        for (int i = 0; i < segmentation.length; i++) {
+          wordIdAndLength[i + 1] = segmentation[i].length();
+          dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
+          wordId++;
+        }
+        dictionary.entries.put(values[0], wordIdAndLength);
+      }
+    } finally {
+      reader.close();
+    }
+    return dictionary;
+  }
+  
 }