You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC
svn commit: r1226637 [1/3] - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...
Author: rmuir
Date: Tue Jan 3 04:33:56 2012
New Revision: 1226637
URL: http://svn.apache.org/viewvc?rev=1226637&view=rev
Log:
LUCENE-3305: indent 2 spaces
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/Trie.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/CSVUtil.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/DoubleArrayTrieBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/DebugTokenizer.java Tue Jan 3 04:33:56 2012
@@ -32,58 +32,58 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
public class DebugTokenizer {
-
- private GraphvizFormatter formatter;
-
- private Viterbi viterbi;
-
- protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
-
- this.viterbi = new Viterbi(Dictionaries.getTrie(),
- Dictionaries.getDictionary(),
- Dictionaries.getUnknownDictionary(),
- Dictionaries.getCosts(),
- userDictionary,
- mode);
-
- this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
- }
-
- public String debugTokenize(String text) {
- ViterbiNode[][][] lattice = this.viterbi.build(text);
- List<ViterbiNode> bestPath = this.viterbi.search(lattice);
- return this.formatter.format(lattice[0], lattice[1], bestPath);
- }
-
- public static Builder builder() {
- return new Builder();
- }
-
- public static class Builder {
-
- private Mode mode = Mode.NORMAL;
-
- private UserDictionary userDictionary = null;
-
- public synchronized Builder mode(Mode mode) {
- this.mode = mode;
- return this;
- }
-
- public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
- throws IOException {
- this.userDictionary = UserDictionary.read(userDictionaryInputStream);
- return this;
- }
-
- public synchronized Builder userDictionary(String userDictionaryPath)
- throws FileNotFoundException, IOException {
- this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
- return this;
- }
-
- public synchronized DebugTokenizer build() {
- return new DebugTokenizer(userDictionary, mode);
- }
- }
+
+ private GraphvizFormatter formatter;
+
+ private Viterbi viterbi;
+
+ protected DebugTokenizer(UserDictionary userDictionary, Mode mode) {
+
+ this.viterbi = new Viterbi(Dictionaries.getTrie(),
+ Dictionaries.getDictionary(),
+ Dictionaries.getUnknownDictionary(),
+ Dictionaries.getCosts(),
+ userDictionary,
+ mode);
+
+ this.formatter = new GraphvizFormatter(Dictionaries.getCosts());
+ }
+
+ public String debugTokenize(String text) {
+ ViterbiNode[][][] lattice = this.viterbi.build(text);
+ List<ViterbiNode> bestPath = this.viterbi.search(lattice);
+ return this.formatter.format(lattice[0], lattice[1], bestPath);
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ public static class Builder {
+
+ private Mode mode = Mode.NORMAL;
+
+ private UserDictionary userDictionary = null;
+
+ public synchronized Builder mode(Mode mode) {
+ this.mode = mode;
+ return this;
+ }
+
+ public synchronized Builder userDictionary(InputStream userDictionaryInputStream)
+ throws IOException {
+ this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+ return this;
+ }
+
+ public synchronized Builder userDictionary(String userDictionaryPath)
+ throws FileNotFoundException, IOException {
+ this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
+ return this;
+ }
+
+ public synchronized DebugTokenizer build() {
+ return new DebugTokenizer(userDictionary, mode);
+ }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan 3 04:33:56 2012
@@ -21,90 +21,90 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class Token {
- private final Dictionary dictionary;
-
- private final int wordId;
-
- private final String surfaceForm;
-
- private final int position;
-
- private final Type type;
-
- public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
- this.wordId = wordId;
- this.surfaceForm = surfaceForm;
- this.type = type;
- this.position = position;
- this.dictionary = dictionary;
- }
-
- /**
- * @return surfaceForm
- */
- public String getSurfaceForm() {
- return surfaceForm;
- }
-
- /**
- * @return all features
- */
- public String getAllFeatures() {
- return dictionary.getAllFeatures(wordId);
- }
-
- /**
- * @return all features as array
- */
- public String[] getAllFeaturesArray() {
- return dictionary.getAllFeaturesArray(wordId);
- }
-
-
- /**
- * @return reading. null if token doesn't have reading.
- */
- public String getReading() {
- return dictionary.getReading(wordId);
- }
-
- /**
- * @return part of speech.
- */
- public String getPartOfSpeech() {
- return dictionary.getPartOfSpeech(wordId);
- }
-
- /**
- * Returns true if this token is known word
- * @return true if this token is in standard dictionary. false if not.
- */
- public boolean isKnown() {
- return type == Type.KNOWN;
- }
-
- /**
- * Returns true if this token is unknown word
- * @return true if this token is unknown word. false if not.
- */
- public boolean isUnknown() {
- return type == Type.UNKNOWN;
- }
-
- /**
- * Returns true if this token is defined in user dictionary
- * @return true if this token is in user dictionary. false if not.
- */
- public boolean isUser() {
- return type == Type.USER;
- }
-
- /**
- * Get index of this token in input text
- * @return position of token
- */
- public int getPosition() {
- return position;
- }
-
+ private final Dictionary dictionary;
+
+ private final int wordId;
+
+ private final String surfaceForm;
+
+ private final int position;
+
+ private final Type type;
+
+ public Token(int wordId, String surfaceForm, Type type, int position, Dictionary dictionary) {
+ this.wordId = wordId;
+ this.surfaceForm = surfaceForm;
+ this.type = type;
+ this.position = position;
+ this.dictionary = dictionary;
+ }
+
+ /**
+ * @return surfaceForm
+ */
+ public String getSurfaceForm() {
+ return surfaceForm;
+ }
+
+ /**
+ * @return all features
+ */
+ public String getAllFeatures() {
+ return dictionary.getAllFeatures(wordId);
+ }
+
+ /**
+ * @return all features as array
+ */
+ public String[] getAllFeaturesArray() {
+ return dictionary.getAllFeaturesArray(wordId);
+ }
+
+
+ /**
+ * @return reading. null if token doesn't have reading.
+ */
+ public String getReading() {
+ return dictionary.getReading(wordId);
+ }
+
+ /**
+ * @return part of speech.
+ */
+ public String getPartOfSpeech() {
+ return dictionary.getPartOfSpeech(wordId);
+ }
+
+ /**
+ * Returns true if this token is known word
+ * @return true if this token is in standard dictionary. false if not.
+ */
+ public boolean isKnown() {
+ return type == Type.KNOWN;
+ }
+
+ /**
+ * Returns true if this token is unknown word
+ * @return true if this token is unknown word. false if not.
+ */
+ public boolean isUnknown() {
+ return type == Type.UNKNOWN;
+ }
+
+ /**
+ * Returns true if this token is defined in user dictionary
+ * @return true if this token is in user dictionary. false if not.
+ */
+ public boolean isUser() {
+ return type == Type.USER;
+ }
+
+ /**
+ * Get index of this token in input text
+ * @return position of token
+ */
+ public int getPosition() {
+ return position;
+ }
+
}
\ No newline at end of file
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Tokenizer.java Tue Jan 3 04:33:56 2012
@@ -38,201 +38,201 @@ import org.apache.lucene.analysis.kuromo
* Thread safe.
*/
public class Tokenizer {
- public enum Mode {
- NORMAL, SEARCH, EXTENDED
- }
-
- private final Viterbi viterbi;
-
- private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
-
- private final boolean split;
-
- /**
- * Constructor
- * @param dictionary
- * @param costs
- * @param trie
- * @param unkDictionary
- * @param userDictionary
- * @param mode
- */
- protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
-
- this.viterbi = new Viterbi(Dictionaries.getTrie(),
- Dictionaries.getDictionary(),
- Dictionaries.getUnknownDictionary(),
- Dictionaries.getCosts(),
- userDictionary,
- mode);
-
- this.split = split;
-
- dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
- dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
- dictionaryMap.put(Type.USER, userDictionary);
- }
-
- /**
- * Tokenize input text
- * @param text
- * @return list of Token
- */
- public List<Token> tokenize(String text) {
-
- if (!split) {
- return doTokenize(0, text);
- }
-
- List<Integer> splitPositions = getSplitPositions(text);
-
- if(splitPositions.size() == 0) {
- return doTokenize(0, text);
- }
-
- ArrayList<Token> result = new ArrayList<Token>();
- int offset = 0;
- for(int position : splitPositions) {
- result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
- offset = position + 1;
- }
-
- if(offset < text.length()) {
- result.addAll(doTokenize(offset, text.substring(offset)));
- }
-
- return result;
- }
-
- /**
- * Split input text at 句読点, which is 。 and 、
- * @param text
- * @return list of split position
- */
- private List<Integer> getSplitPositions(String text) {
- ArrayList<Integer> splitPositions = new ArrayList<Integer>();
-
- int position = 0;
- int currentPosition = 0;
-
- while(true) {
- int indexOfMaru = text.indexOf("。", currentPosition);
- int indexOfTen = text.indexOf("、", currentPosition);
-
- if(indexOfMaru < 0 || indexOfTen < 0) {
- position = Math.max(indexOfMaru, indexOfTen);;
- } else {
- position = Math.min(indexOfMaru, indexOfTen);
- }
-
- if(position >= 0) {
- splitPositions.add(position);
- currentPosition = position + 1;
- } else {
- break;
- }
- }
-
- return splitPositions;
- }
-
- /**
- * Tokenize input sentence.
- * @param offset offset of sentence in original input text
- * @param sentence sentence to tokenize
- * @return list of Token
- */
- private List<Token> doTokenize(int offset, String sentence) {
- ArrayList<Token> result = new ArrayList<Token>();
-
- ViterbiNode[][][] lattice = viterbi.build(sentence);
- List<ViterbiNode> bestPath = viterbi.search(lattice);
- for (ViterbiNode node : bestPath) {
- int wordId = node.getWordId();
- if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
- continue;
- }
- Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
- result.add(token);
- }
-
- return result;
- }
-
- /**
- * Get Builder to create Tokenizer instance.
- * @return Builder
- */
- public static Builder builder() {
- return new Builder();
- }
-
- /**
- * Builder class used to create Tokenizer instance.
- */
- public static class Builder {
-
- private Mode mode = Mode.NORMAL;
-
- private boolean split = true;
-
- private UserDictionary userDictionary = null;
-
- /**
- * Set tokenization mode
- * Default: NORMAL
- * @param mode tokenization mode
- * @return Builder
- */
- public synchronized Builder mode(Mode mode) {
- this.mode = mode;
- return this;
- }
-
- /**
- * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
- * Splitting shouldn't change the result of tokenization most of the cases.
- * Default: true
- *
- * @param split whether tokenizer should split input string
- * @return Builder
- */
- public synchronized Builder split(boolean split) {
- this.split = split;
- return this;
- }
-
- /**
- * Set user dictionary input stream
- * @param userDictionaryInputStream dictionary file as input stream
- * @return Builder
- * @throws IOException
- */
- public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
- this.userDictionary = UserDictionary.read(userDictionaryInputStream);
- return this;
- }
-
- /**
- * Set user dictionary path
- * @param userDictionaryPath path to dictionary file
- * @return Builder
- * @throws IOException
- * @throws FileNotFoundException
- */
- public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
- if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
- this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
- }
- return this;
- }
-
- /**
- * Create Tokenizer instance
- * @return Tokenizer
- */
- public synchronized Tokenizer build() {
- return new Tokenizer(userDictionary, mode, split);
- }
- }
+ public enum Mode {
+ NORMAL, SEARCH, EXTENDED
+ }
+
+ private final Viterbi viterbi;
+
+ private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
+
+ private final boolean split;
+
+ /**
+ * Constructor
+ * @param dictionary
+ * @param costs
+ * @param trie
+ * @param unkDictionary
+ * @param userDictionary
+ * @param mode
+ */
+ protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
+
+ this.viterbi = new Viterbi(Dictionaries.getTrie(),
+ Dictionaries.getDictionary(),
+ Dictionaries.getUnknownDictionary(),
+ Dictionaries.getCosts(),
+ userDictionary,
+ mode);
+
+ this.split = split;
+
+ dictionaryMap.put(Type.KNOWN, Dictionaries.getDictionary());
+ dictionaryMap.put(Type.UNKNOWN, Dictionaries.getUnknownDictionary());
+ dictionaryMap.put(Type.USER, userDictionary);
+ }
+
+ /**
+ * Tokenize input text
+ * @param text
+ * @return list of Token
+ */
+ public List<Token> tokenize(String text) {
+
+ if (!split) {
+ return doTokenize(0, text);
+ }
+
+ List<Integer> splitPositions = getSplitPositions(text);
+
+ if(splitPositions.size() == 0) {
+ return doTokenize(0, text);
+ }
+
+ ArrayList<Token> result = new ArrayList<Token>();
+ int offset = 0;
+ for(int position : splitPositions) {
+ result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
+ offset = position + 1;
+ }
+
+ if(offset < text.length()) {
+ result.addAll(doTokenize(offset, text.substring(offset)));
+ }
+
+ return result;
+ }
+
+ /**
+ * Split input text at 句読点, which is 。 and 、
+ * @param text
+ * @return list of split position
+ */
+ private List<Integer> getSplitPositions(String text) {
+ ArrayList<Integer> splitPositions = new ArrayList<Integer>();
+
+ int position = 0;
+ int currentPosition = 0;
+
+ while(true) {
+ int indexOfMaru = text.indexOf("。", currentPosition);
+ int indexOfTen = text.indexOf("、", currentPosition);
+
+ if(indexOfMaru < 0 || indexOfTen < 0) {
+ position = Math.max(indexOfMaru, indexOfTen);;
+ } else {
+ position = Math.min(indexOfMaru, indexOfTen);
+ }
+
+ if(position >= 0) {
+ splitPositions.add(position);
+ currentPosition = position + 1;
+ } else {
+ break;
+ }
+ }
+
+ return splitPositions;
+ }
+
+ /**
+ * Tokenize input sentence.
+ * @param offset offset of sentence in original input text
+ * @param sentence sentence to tokenize
+ * @return list of Token
+ */
+ private List<Token> doTokenize(int offset, String sentence) {
+ ArrayList<Token> result = new ArrayList<Token>();
+
+ ViterbiNode[][][] lattice = viterbi.build(sentence);
+ List<ViterbiNode> bestPath = viterbi.search(lattice);
+ for (ViterbiNode node : bestPath) {
+ int wordId = node.getWordId();
+ if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
+ continue;
+ }
+ Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
+ result.add(token);
+ }
+
+ return result;
+ }
+
+ /**
+ * Get Builder to create Tokenizer instance.
+ * @return Builder
+ */
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ /**
+ * Builder class used to create Tokenizer instance.
+ */
+ public static class Builder {
+
+ private Mode mode = Mode.NORMAL;
+
+ private boolean split = true;
+
+ private UserDictionary userDictionary = null;
+
+ /**
+ * Set tokenization mode
+ * Default: NORMAL
+ * @param mode tokenization mode
+ * @return Builder
+ */
+ public synchronized Builder mode(Mode mode) {
+ this.mode = mode;
+ return this;
+ }
+
+ /**
+ * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
+ * Splitting shouldn't change the result of tokenization most of the cases.
+ * Default: true
+ *
+ * @param split whether tokenizer should split input string
+ * @return Builder
+ */
+ public synchronized Builder split(boolean split) {
+ this.split = split;
+ return this;
+ }
+
+ /**
+ * Set user dictionary input stream
+ * @param userDictionaryInputStream dictionary file as input stream
+ * @return Builder
+ * @throws IOException
+ */
+ public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
+ this.userDictionary = UserDictionary.read(userDictionaryInputStream);
+ return this;
+ }
+
+ /**
+ * Set user dictionary path
+ * @param userDictionaryPath path to dictionary file
+ * @return Builder
+ * @throws IOException
+ * @throws FileNotFoundException
+ */
+ public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
+ if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
+ this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
+ }
+ return this;
+ }
+
+ /**
+ * Create Tokenizer instance
+ * @return Tokenizer
+ */
+ public synchronized Tokenizer build() {
+ return new Tokenizer(userDictionary, mode, split);
+ }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java Tue Jan 3 04:33:56 2012
@@ -21,78 +21,78 @@ import java.io.Serializable;
import java.util.EnumMap;
public final class CharacterDefinition implements Serializable {
- private static final long serialVersionUID = -1436753619176638532L;
-
- private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
-
- private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
- new EnumMap<CharacterClass, int[]>(CharacterClass.class); // invoke, group, length
-
- public enum CharacterClass {
- NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
-
- public int getId() {
- return ordinal();
- }
- }
-
- /**
- * Constructor
- */
- public CharacterDefinition() {
- for (int i = 0; i < characterCategoryMap.length; i++) {
- characterCategoryMap[i] = CharacterClass.DEFAULT;
- }
- }
-
- public int lookup(char c) {
- return characterCategoryMap[c].getId();
- }
-
- public CharacterClass getCharacterClass(char c) {
- return characterCategoryMap[c];
- }
-
- public boolean isInvoke(char c) {
- CharacterClass characterClass = characterCategoryMap[c];
- int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
- return invokeDefinition[0] == 1;
- }
-
- public boolean isGroup(char c) {
- CharacterClass characterClass = characterCategoryMap[c];
- int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
- return invokeDefinition[1] == 1;
- }
-
- public boolean isKanji(char c) {
- return characterCategoryMap[c] == CharacterClass.KANJI ||
- characterCategoryMap[c] == CharacterClass.KANJINUMERIC;
- }
-
- /**
- * Put mapping from unicode code point to character class.
- *
- * @param codePoint
- * code point
- * @param class character class name
- */
- public void putCharacterCategory(int codePoint, String characterClassName) {
- characterClassName = characterClassName.split(" ")[0]; // use first
- // category
- // class
-
- // Override Nakaguro
- if (codePoint == 0x30FB) {
- characterClassName = "SYMBOL";
- }
- characterCategoryMap[codePoint] = CharacterClass.valueOf(characterClassName);
- }
-
- public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
- CharacterClass characterClass = CharacterClass
- .valueOf(characterClassName);
- int[] values = { invoke, group, length };
- invokeDefinitionMap.put(characterClass, values);
- }
+ private static final long serialVersionUID = -1436753619176638532L;
+
+ private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
+
+ private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
+ new EnumMap<CharacterClass, int[]>(CharacterClass.class); // invoke, group, length
+
+ public enum CharacterClass {
+ NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
+
+ public int getId() {
+ return ordinal();
+ }
+ }
+
+ /**
+ * Constructor
+ */
+ public CharacterDefinition() {
+ for (int i = 0; i < characterCategoryMap.length; i++) {
+ characterCategoryMap[i] = CharacterClass.DEFAULT;
+ }
+ }
+
+ public int lookup(char c) {
+ return characterCategoryMap[c].getId();
+ }
+
+ public CharacterClass getCharacterClass(char c) {
+ return characterCategoryMap[c];
+ }
+
+ public boolean isInvoke(char c) {
+ CharacterClass characterClass = characterCategoryMap[c];
+ int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
+ return invokeDefinition[0] == 1;
+ }
+
+ public boolean isGroup(char c) {
+ CharacterClass characterClass = characterCategoryMap[c];
+ int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
+ return invokeDefinition[1] == 1;
+ }
+
+ public boolean isKanji(char c) {
+ return characterCategoryMap[c] == CharacterClass.KANJI ||
+ characterCategoryMap[c] == CharacterClass.KANJINUMERIC;
+ }
+
+ /**
+ * Put mapping from unicode code point to character class.
+ *
+ * @param codePoint
+ * code point
+ * @param class character class name
+ */
+ public void putCharacterCategory(int codePoint, String characterClassName) {
+ characterClassName = characterClassName.split(" ")[0]; // use first
+ // category
+ // class
+
+ // Override Nakaguro
+ if (codePoint == 0x30FB) {
+ characterClassName = "SYMBOL";
+ }
+ characterCategoryMap[codePoint] = CharacterClass.valueOf(characterClassName);
+ }
+
+ public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+ CharacterClass characterClass = CharacterClass
+ .valueOf(characterClassName);
+ int[] values = { invoke, group, length };
+ invokeDefinitionMap.put(characterClass, values);
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Tue Jan 3 04:33:56 2012
@@ -28,53 +28,53 @@ import java.io.ObjectOutputStream;
import java.io.Serializable;
public class ConnectionCosts implements Serializable{
-
- private static final long serialVersionUID = -7704592689635266457L;
-
- public static final String FILENAME = "cc.dat";
-
- private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
-
- public ConnectionCosts() {
-
- }
-
- public ConnectionCosts(int forwardSize, int backwardSize) {
- this.costs = new short[backwardSize][forwardSize];
- }
-
- public void add(int forwardId, int backwardId, int cost) {
- this.costs[backwardId][forwardId] = (short)cost;
- }
-
- public int get(int forwardId, int backwardId) {
- // FIXME: There seems to be something wrong with the double array trie in some rare
- // cases causing and IndexOutOfBoundsException. Use a guard as a temporary work-around
- // and return a high cost to advise Mr. Viterbi strongly to not use this transition
- if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
- return costs[backwardId][forwardId];
- } else {
- return 50000;
- }
- }
-
- public void write(String directoryname) throws IOException {
- String filename = directoryname + File.separator + FILENAME;
- ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
- outputStream.writeObject(this);
- outputStream.close();
- }
-
- public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
- InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
- return read(is);
- }
-
- public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
- ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
- ConnectionCosts instance = (ConnectionCosts) ois.readObject();
- ois.close();
- return instance;
- }
-
+
+ private static final long serialVersionUID = -7704592689635266457L;
+
+ public static final String FILENAME = "cc.dat";
+
+ private short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
+
+ public ConnectionCosts() {
+
+ }
+
+ public ConnectionCosts(int forwardSize, int backwardSize) {
+ this.costs = new short[backwardSize][forwardSize];
+ }
+
+ public void add(int forwardId, int backwardId, int cost) {
+ this.costs[backwardId][forwardId] = (short)cost;
+ }
+
+ public int get(int forwardId, int backwardId) {
+ // FIXME: There seems to be something wrong with the double array trie in some rare
+ // cases causing and IndexOutOfBoundsException. Use a guard as a temporary work-around
+ // and return a high cost to advise Mr. Viterbi strongly to not use this transition
+ if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
+ return costs[backwardId][forwardId];
+ } else {
+ return 50000;
+ }
+ }
+
+ public void write(String directoryname) throws IOException {
+ String filename = directoryname + File.separator + FILENAME;
+ ObjectOutputStream outputStream = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+ outputStream.writeObject(this);
+ outputStream.close();
+ }
+
+ public static ConnectionCosts getInstance() throws IOException, ClassNotFoundException {
+ InputStream is = ConnectionCosts.class.getClassLoader().getResourceAsStream(FILENAME);
+ return read(is);
+ }
+
+ public static ConnectionCosts read(InputStream is) throws IOException, ClassNotFoundException {
+ ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+ ConnectionCosts instance = (ConnectionCosts) ois.readObject();
+ ois.close();
+ return instance;
+ }
+
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionaries.java Tue Jan 3 04:33:56 2012
@@ -20,91 +20,91 @@ package org.apache.lucene.analysis.kurom
import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
public final class Dictionaries {
-
- private static TokenInfoDictionary dictionary;
-
- private static UnknownDictionary unknownDictionary;
-
- private static ConnectionCosts costs;
-
- private static DoubleArrayTrie trie;
-
- private static boolean initialized = false;
-
- static {
- load();
- }
-
- private static synchronized void load() {
-
- if (Dictionaries.initialized) {
- return;
- }
-
- try {
- Dictionaries.dictionary = TokenInfoDictionary.getInstance();
- Dictionaries.unknownDictionary = UnknownDictionary.getInstance();
- Dictionaries.costs = ConnectionCosts.getInstance();
- Dictionaries.trie = DoubleArrayTrie.getInstance();
- Dictionaries.initialized = true;
- } catch (Exception ex) {
- throw new RuntimeException("Could not load dictionaries! Ouch, ouch, ouch...", ex);
- }
- }
-
- /**
- * @return the dictionary
- */
- public static TokenInfoDictionary getDictionary() {
- return dictionary;
- }
-
- /**
- * @param dictionary the dictionary to set
- */
- public static void setDictionary(TokenInfoDictionary dictionary) {
- Dictionaries.dictionary = dictionary;
- }
-
- /**
- * @return the unknownDictionary
- */
- public static UnknownDictionary getUnknownDictionary() {
- return unknownDictionary;
- }
-
- /**
- * @param unknownDictionary the unknownDictionary to set
- */
- public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
- Dictionaries.unknownDictionary = unknownDictionary;
- }
-
- /**
- * @return the costs
- */
- public static ConnectionCosts getCosts() {
- return costs;
- }
-
- /**
- * @param costs the costs to set
- */
- public static void setCosts(ConnectionCosts costs) {
- Dictionaries.costs = costs;
- }
-
- /**
- * @return the trie
- */
- public static DoubleArrayTrie getTrie() {
- return trie;
- }
-
- /**
- * @param trie the trie to set
- */
- public static void setTrie(DoubleArrayTrie trie) {
- Dictionaries.trie = trie;
- }
+
+ private static TokenInfoDictionary dictionary;
+
+ private static UnknownDictionary unknownDictionary;
+
+ private static ConnectionCosts costs;
+
+ private static DoubleArrayTrie trie;
+
+ private static boolean initialized = false;
+
+ static {
+ load();
+ }
+
+ private static synchronized void load() {
+
+ if (Dictionaries.initialized) {
+ return;
+ }
+
+ try {
+ Dictionaries.dictionary = TokenInfoDictionary.getInstance();
+ Dictionaries.unknownDictionary = UnknownDictionary.getInstance();
+ Dictionaries.costs = ConnectionCosts.getInstance();
+ Dictionaries.trie = DoubleArrayTrie.getInstance();
+ Dictionaries.initialized = true;
+ } catch (Exception ex) {
+ throw new RuntimeException("Could not load dictionaries! Ouch, ouch, ouch...", ex);
+ }
+ }
+
+ /**
+ * @return the dictionary
+ */
+ public static TokenInfoDictionary getDictionary() {
+ return dictionary;
+ }
+
+ /**
+ * @param dictionary the dictionary to set
+ */
+ public static void setDictionary(TokenInfoDictionary dictionary) {
+ Dictionaries.dictionary = dictionary;
+ }
+
+ /**
+ * @return the unknownDictionary
+ */
+ public static UnknownDictionary getUnknownDictionary() {
+ return unknownDictionary;
+ }
+
+ /**
+ * @param unknownDictionary the unknownDictionary to set
+ */
+ public static void setUnknownDictionary(UnknownDictionary unknownDictionary) {
+ Dictionaries.unknownDictionary = unknownDictionary;
+ }
+
+ /**
+ * @return the costs
+ */
+ public static ConnectionCosts getCosts() {
+ return costs;
+ }
+
+ /**
+ * @param costs the costs to set
+ */
+ public static void setCosts(ConnectionCosts costs) {
+ Dictionaries.costs = costs;
+ }
+
+ /**
+ * @return the trie
+ */
+ public static DoubleArrayTrie getTrie() {
+ return trie;
+ }
+
+ /**
+ * @param trie the trie to set
+ */
+ public static void setTrie(DoubleArrayTrie trie) {
+ Dictionaries.trie = trie;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan 3 04:33:56 2012
@@ -18,63 +18,63 @@ package org.apache.lucene.analysis.kurom
*/
public interface Dictionary {
-
- public static final String INTERNAL_SEPARATOR = "\u0000";
-
- /**
- * Get left id of specified word
- * @param wordId
- * @return left id
- */
- public int getLeftId(int wordId);
-
- /**
- * Get right id of specified word
- * @param wordId
- * @return left id
- */
- public int getRightId(int wordId);
-
- /**
- * Get word cost of specified word
- * @param wordId
- * @return left id
- */
- public int getWordCost(int wordId);
-
- /**
- * Get all features of tokens
- * @param wordId word ID of token
- * @return All features of the token
- */
- public String getAllFeatures(int wordId);
-
- /**
- * Get all features as array
- * @param wordId word ID of token
- * @return Array containing all features of the token
- */
- public String[] getAllFeaturesArray(int wordId);
-
- /**
- * Get Part-Of-Speech of tokens
- * @param wordId word ID of token
- * @return Part-Of-Speech of the token
- */
- public String getPartOfSpeech(int wordId);
-
- /**
- * Get reading of tokens
- * @param wordId word ID of token
- * @return Reading of the token
- */
- public String getReading(int wordId);
-
- /**
- * Get feature(s) of tokens
- * @param wordId word ID token
- * @param fields array of index. If this is empty, return all features.
- * @return Features of the token
- */
- public String getFeature(int wordId, int... fields);
+
+ public static final String INTERNAL_SEPARATOR = "\u0000";
+
+ /**
+ * Get left id of specified word
+ * @param wordId
+ * @return left id
+ */
+ public int getLeftId(int wordId);
+
+ /**
+ * Get right id of specified word
+ * @param wordId
+ * @return right id
+ */
+ public int getRightId(int wordId);
+
+ /**
+ * Get word cost of specified word
+ * @param wordId
+ * @return word cost
+ */
+ public int getWordCost(int wordId);
+
+ /**
+ * Get all features of tokens
+ * @param wordId word ID of token
+ * @return All features of the token
+ */
+ public String getAllFeatures(int wordId);
+
+ /**
+ * Get all features as array
+ * @param wordId word ID of token
+ * @return Array containing all features of the token
+ */
+ public String[] getAllFeaturesArray(int wordId);
+
+ /**
+ * Get Part-Of-Speech of tokens
+ * @param wordId word ID of token
+ * @return Part-Of-Speech of the token
+ */
+ public String getPartOfSpeech(int wordId);
+
+ /**
+ * Get reading of tokens
+ * @param wordId word ID of token
+ * @return Reading of the token
+ */
+ public String getReading(int wordId);
+
+ /**
+ * Get feature(s) of tokens
+ * @param wordId word ID of token
+ * @param fields array of feature indices. If this is empty, return all features.
+ * @return Features of the token
+ */
+ public String getFeature(int wordId, int... fields);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Tue Jan 3 04:33:56 2012
@@ -35,210 +35,210 @@ import java.nio.channels.WritableByteCha
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
public class TokenInfoDictionary implements Dictionary{
-
- public static final String FILENAME = "tid.dat";
-
- public static final String TARGETMAP_FILENAME = "tid_map.dat";
-
- protected ByteBuffer buffer;
-
- protected int[][] targetMap;
-
- public TokenInfoDictionary() {
- }
-
- public TokenInfoDictionary(int size) {
- targetMap = new int[1][];
- buffer = ByteBuffer.allocate(size);
- }
-
- /**
- * put the entry in map
- * @param wordId
- * @param entry
- * @return current position of buffer, which will be wordId of next entry
- */
- public int put(String[] entry) {
- short leftId = Short.parseShort(entry[1]);
- short rightId = Short.parseShort(entry[2]);
- short wordCost = Short.parseShort(entry[3]);
-
- StringBuilder sb = new StringBuilder();
- for (int i = 4; i < entry.length; i++){
- sb.append(entry[i]).append(INTERNAL_SEPARATOR);
- }
- String features = sb.deleteCharAt(sb.length() - 1).toString();
- int featuresSize = features.length()* 2;
-
- // extend buffer if necessary
- int left = buffer.limit() - buffer.position();
- if (8 + featuresSize > left) { // four short and features
- ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
- buffer.flip();
- newBuffer.put(buffer);
- buffer = newBuffer;
- }
-
- buffer.putShort(leftId);
- buffer.putShort(rightId);
- buffer.putShort(wordCost);
- buffer.putShort((short)featuresSize);
- for (char c : features.toCharArray()){
- buffer.putChar(c);
- }
-
- return buffer.position();
- }
-
- public void addMapping(int sourceId, int wordId) {
- if(targetMap.length <= sourceId) {
- int[][] newArray = new int[sourceId + 1][];
- System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
- targetMap = newArray;
- }
-
- // Prepare array -- extend the length of array by one
- int[] current = targetMap[sourceId];
- if (current == null) {
- current = new int[1];
- } else {
- int[] newArray = new int[current.length + 1];
- System.arraycopy(current, 0, newArray, 0, current.length);
- current = newArray;
- }
- targetMap[sourceId] = current;
-
- int[] targets = targetMap[sourceId];
- targets[targets.length - 1] = wordId;
- }
-
- public int[] lookupWordIds(int sourceId) {
- return targetMap[sourceId];
- }
-
- @Override
- public int getLeftId(int wordId) {
- return buffer.getShort(wordId);
- }
-
- @Override
- public int getRightId(int wordId) {
- return buffer.getShort(wordId + 2); // Skip left id
- }
-
- @Override
- public int getWordCost(int wordId) {
- return buffer.getShort(wordId + 4); // Skip left id and right id
- }
-
- @Override
- public String[] getAllFeaturesArray(int wordId) {
- int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
- char[] targetArr = new char[size];
- int offset = wordId + 6 + 2; // offset is position where features string starts
- for(int i = 0; i < size; i++){
- targetArr[i] = buffer.getChar(offset + i * 2);
- }
- String allFeatures = new String(targetArr);
- return allFeatures.split(INTERNAL_SEPARATOR);
- }
-
- @Override
- public String getFeature(int wordId, int... fields) {
- String[] allFeatures = getAllFeaturesArray(wordId);
- StringBuilder sb = new StringBuilder();
-
- if(fields.length == 0){ // All features
- for(String feature : allFeatures) {
- sb.append(CSVUtil.quoteEscape(feature)).append(",");
- }
- } else if(fields.length == 1) { // One feature doesn't need to escape value
- sb.append(allFeatures[fields[0]]).append(",");
- } else {
- for(int field : fields){
- sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
- }
- }
-
- return sb.deleteCharAt(sb.length() - 1).toString();
- }
-
- @Override
- public String getReading(int wordId) {
- return getFeature(wordId, 7);
- }
-
- @Override
- public String getAllFeatures(int wordId) {
- return getFeature(wordId);
- }
-
- @Override
- public String getPartOfSpeech(int wordId) {
- return getFeature(wordId, 0, 1, 2, 3);
- }
-
-
- /**
- * Write dictionary in file
- * Dictionary format is:
- * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
- * @param filename
- * @throws IOException
- */
- public void write(String directoryname) throws IOException {
- writeDictionary(directoryname + File.separator + FILENAME);
- writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
- }
-
- protected void writeTargetMap(String filename) throws IOException {
- ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
- oos.writeObject(targetMap);
- oos.close();
- }
-
- protected void writeDictionary(String filename) throws IOException {
- FileOutputStream fos = new FileOutputStream(filename);
- DataOutputStream dos = new DataOutputStream(fos);
- dos.writeInt(buffer.position());
- WritableByteChannel channel = Channels.newChannel(fos);
- // Write Buffer
- buffer.flip(); // set position to 0, set limit to current position
- channel.write(buffer);
-
- fos.close();
- }
-
- /**
- * Read dictionary into directly allocated buffer.
- * @return TokenInfoDictionary instance
- * @throws IOException
- * @throws ClassNotFoundException
- */
- public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
- TokenInfoDictionary dictionary = new TokenInfoDictionary();
- ClassLoader loader = dictionary.getClass().getClassLoader();
- dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
- dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
- return dictionary;
- }
-
- protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
- ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
- targetMap = (int[][]) ois.readObject();
- is.close();
- }
-
- protected void loadDictionary(InputStream is) throws IOException {
- DataInputStream dis = new DataInputStream(is);
- int size = dis.readInt();
-
- ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
-
- ReadableByteChannel channel = Channels.newChannel(is);
- channel.read(tmpBuffer);
- is.close();
- buffer = tmpBuffer.asReadOnlyBuffer();
- }
-
+
+ public static final String FILENAME = "tid.dat";
+
+ public static final String TARGETMAP_FILENAME = "tid_map.dat";
+
+ protected ByteBuffer buffer;
+
+ protected int[][] targetMap;
+
+ public TokenInfoDictionary() {
+ }
+
+ public TokenInfoDictionary(int size) {
+ targetMap = new int[1][];
+ buffer = ByteBuffer.allocate(size);
+ }
+
+ /**
+ * put the entry in map
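+ * @param entry entry fields: entry[1], entry[2] and entry[3] are left id, right id and word cost;
+ * entry[4] onward are the features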
+ * @return current position of buffer, which will be wordId of next entry
+ */
+ public int put(String[] entry) {
+ short leftId = Short.parseShort(entry[1]);
+ short rightId = Short.parseShort(entry[2]);
+ short wordCost = Short.parseShort(entry[3]);
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 4; i < entry.length; i++){
+ sb.append(entry[i]).append(INTERNAL_SEPARATOR);
+ }
+ String features = sb.deleteCharAt(sb.length() - 1).toString();
+ int featuresSize = features.length()* 2;
+
+ // extend buffer if necessary
+ int left = buffer.limit() - buffer.position();
+ if (8 + featuresSize > left) { // four short and features
+ ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
+ buffer.flip();
+ newBuffer.put(buffer);
+ buffer = newBuffer;
+ }
+
+ buffer.putShort(leftId);
+ buffer.putShort(rightId);
+ buffer.putShort(wordCost);
+ buffer.putShort((short)featuresSize);
+ for (char c : features.toCharArray()){
+ buffer.putChar(c);
+ }
+
+ return buffer.position();
+ }
+
+ public void addMapping(int sourceId, int wordId) {
+ if(targetMap.length <= sourceId) {
+ int[][] newArray = new int[sourceId + 1][];
+ System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+ targetMap = newArray;
+ }
+
+ // Prepare array -- extend the length of array by one
+ int[] current = targetMap[sourceId];
+ if (current == null) {
+ current = new int[1];
+ } else {
+ int[] newArray = new int[current.length + 1];
+ System.arraycopy(current, 0, newArray, 0, current.length);
+ current = newArray;
+ }
+ targetMap[sourceId] = current;
+
+ int[] targets = targetMap[sourceId];
+ targets[targets.length - 1] = wordId;
+ }
+
+ public int[] lookupWordIds(int sourceId) {
+ return targetMap[sourceId];
+ }
+
+ @Override
+ public int getLeftId(int wordId) {
+ return buffer.getShort(wordId);
+ }
+
+ @Override
+ public int getRightId(int wordId) {
+ return buffer.getShort(wordId + 2); // Skip left id
+ }
+
+ @Override
+ public int getWordCost(int wordId) {
+ return buffer.getShort(wordId + 4); // Skip left id and right id
+ }
+
+ @Override
+ public String[] getAllFeaturesArray(int wordId) {
+ int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
+ char[] targetArr = new char[size];
+ int offset = wordId + 6 + 2; // offset is position where features string starts
+ for(int i = 0; i < size; i++){
+ targetArr[i] = buffer.getChar(offset + i * 2);
+ }
+ String allFeatures = new String(targetArr);
+ return allFeatures.split(INTERNAL_SEPARATOR);
+ }
+
+ @Override
+ public String getFeature(int wordId, int... fields) {
+ String[] allFeatures = getAllFeaturesArray(wordId);
+ StringBuilder sb = new StringBuilder();
+
+ if(fields.length == 0){ // All features
+ for(String feature : allFeatures) {
+ sb.append(CSVUtil.quoteEscape(feature)).append(",");
+ }
+ } else if(fields.length == 1) { // One feature doesn't need to escape value
+ sb.append(allFeatures[fields[0]]).append(",");
+ } else {
+ for(int field : fields){
+ sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
+ }
+ }
+
+ return sb.deleteCharAt(sb.length() - 1).toString();
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ return getFeature(wordId, 7);
+ }
+
+ @Override
+ public String getAllFeatures(int wordId) {
+ return getFeature(wordId);
+ }
+
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ return getFeature(wordId, 0, 1, 2, 3);
+ }
+
+
+ /**
+ * Write dictionary in file
+ * Dictionary format is:
+ * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+ * @param directoryname directory in which the dictionary files are written
+ * @throws IOException
+ */
+ public void write(String directoryname) throws IOException {
+ writeDictionary(directoryname + File.separator + FILENAME);
+ writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+ }
+
+ protected void writeTargetMap(String filename) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+ oos.writeObject(targetMap);
+ oos.close();
+ }
+
+ protected void writeDictionary(String filename) throws IOException {
+ FileOutputStream fos = new FileOutputStream(filename);
+ DataOutputStream dos = new DataOutputStream(fos);
+ dos.writeInt(buffer.position());
+ WritableByteChannel channel = Channels.newChannel(fos);
+ // Write Buffer
+ buffer.flip(); // set position to 0, set limit to current position
+ channel.write(buffer);
+
+ fos.close();
+ }
+
+ /**
+ * Read dictionary into directly allocated buffer.
+ * @return TokenInfoDictionary instance
+ * @throws IOException
+ * @throws ClassNotFoundException
+ */
+ public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
+ TokenInfoDictionary dictionary = new TokenInfoDictionary();
+ ClassLoader loader = dictionary.getClass().getClassLoader();
+ dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+ dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+ return dictionary;
+ }
+
+ protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
+ ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+ targetMap = (int[][]) ois.readObject();
+ is.close();
+ }
+
+ protected void loadDictionary(InputStream is) throws IOException {
+ DataInputStream dis = new DataInputStream(is);
+ int size = dis.readInt();
+
+ ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
+
+ ReadableByteChannel channel = Channels.newChannel(is);
+ channel.read(tmpBuffer);
+ is.close();
+ buffer = tmpBuffer.asReadOnlyBuffer();
+ }
+
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan 3 04:33:56 2012
@@ -29,114 +29,114 @@ import java.io.ObjectOutputStream;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
public class UnknownDictionary extends TokenInfoDictionary {
-
- public static final String FILENAME = "unk.dat";
-
- public static final String TARGETMAP_FILENAME = "unk_map.dat";
-
- public static final String CHARDEF_FILENAME = "cd.dat";
-
- private CharacterDefinition characterDefinition;
-
- /**
- * Constructor
- */
- public UnknownDictionary() {
- }
+
+ public static final String FILENAME = "unk.dat";
+
+ public static final String TARGETMAP_FILENAME = "unk_map.dat";
+
+ public static final String CHARDEF_FILENAME = "cd.dat";
+
+ private CharacterDefinition characterDefinition;
+
+ /**
+ * Constructor
+ */
+ public UnknownDictionary() {
+ }
+
+ public UnknownDictionary(int size) {
+ super(size);
+ characterDefinition = new CharacterDefinition();
+ }
+
+ @Override
+ public int put(String[] entry) {
+ // Get wordId of current entry
+ int wordId = buffer.position();
- public UnknownDictionary(int size) {
- super(size);
- characterDefinition = new CharacterDefinition();
- }
+ // Put entry
+ int result = super.put(entry);
- @Override
- public int put(String[] entry) {
- // Get wordId of current entry
- int wordId = buffer.position();
-
- // Put entry
- int result = super.put(entry);
-
- // Put entry in targetMap
- int characterId = CharacterClass.valueOf(entry[0]).getId();
- addMapping(characterId, wordId);
- return result;
+ // Put entry in targetMap
+ int characterId = CharacterClass.valueOf(entry[0]).getId();
+ addMapping(characterId, wordId);
+ return result;
+ }
+
+ public int lookup(String text) {
+ if(!characterDefinition.isGroup(text.charAt(0))) {
+ return 1;
}
- public int lookup(String text) {
- if(!characterDefinition.isGroup(text.charAt(0))) {
- return 1;
- }
-
- // Extract unknown word. Characters with the same character class are considered to be part of unknown word
- int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
- int length = 1;
- for (int i = 1; i < text.length(); i++) {
- if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
- length++;
- } else {
- break;
- }
- }
-
- return length;
+ // Extract unknown word. Characters with the same character class are considered to be part of unknown word
+ int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
+ int length = 1;
+ for (int i = 1; i < text.length(); i++) {
+ if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
+ length++;
+ } else {
+ break;
+ }
}
-
- /**
- * Put mapping from unicode code point to character class.
- *
- * @param codePoint code point
- * @param class character class name
- */
- public void putCharacterCategory(int codePoint, String characterClassName) {
- characterDefinition.putCharacterCategory(codePoint, characterClassName);
- }
-
- public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
- characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
- }
-
-
- public CharacterDefinition getCharacterDefinition() {
- return characterDefinition;
- }
-
- /**
- * Write dictionary in file
- * Dictionary format is:
- * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
- * @param filename
- * @throws IOException
- */
- public void write(String directoryname) throws IOException {
- writeDictionary(directoryname + File.separator + FILENAME);
- writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
- writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
- }
-
- protected void writeCharDef(String filename) throws IOException {
- ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
- oos.writeObject(characterDefinition);
- oos.close();
- }
-
- public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
- UnknownDictionary dictionary = new UnknownDictionary();
- ClassLoader loader = dictionary.getClass().getClassLoader();
- dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
- dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
- dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
- return dictionary;
- }
-
- protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
- ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
- characterDefinition = (CharacterDefinition) ois.readObject();
- ois.close();
- }
-
- @Override
- public String getReading(int wordId) {
- return null;
- }
+
+ return length;
+ }
+
+ /**
+ * Put mapping from unicode code point to character class.
+ *
+ * @param codePoint code point
+ * @param characterClassName character class name
+ */
+ public void putCharacterCategory(int codePoint, String characterClassName) {
+ characterDefinition.putCharacterCategory(codePoint, characterClassName);
+ }
+
+ public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+ characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
+ }
+
+
+ public CharacterDefinition getCharacterDefinition() {
+ return characterDefinition;
+ }
+
+ /**
+ * Write dictionary in file
+ * Dictionary format is:
+ * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+ * @param directoryname directory in which the dictionary files are written
+ * @throws IOException
+ */
+ public void write(String directoryname) throws IOException {
+ writeDictionary(directoryname + File.separator + FILENAME);
+ writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
+ writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
+ }
+
+ protected void writeCharDef(String filename) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
+ oos.writeObject(characterDefinition);
+ oos.close();
+ }
+
+ public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
+ UnknownDictionary dictionary = new UnknownDictionary();
+ ClassLoader loader = dictionary.getClass().getClassLoader();
+ dictionary.loadDictionary(loader.getResourceAsStream(FILENAME));
+ dictionary.loadTargetMap(loader.getResourceAsStream(TARGETMAP_FILENAME));
+ dictionary.loadCharDef(loader.getResourceAsStream(CHARDEF_FILENAME));
+ return dictionary;
+ }
+
+ protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
+ ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
+ characterDefinition = (CharacterDefinition) ois.readObject();
+ ois.close();
+ }
+
+ @Override
+ public String getReading(int wordId) {
+ return null;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan 3 04:33:56 2012
@@ -30,167 +30,167 @@ import java.util.TreeMap;
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
public class UserDictionary implements Dictionary {
-
- private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
-
- private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
-
- private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
-
- public static final int WORD_COST = -100000;
-
- public static final int LEFT_ID = 5;
-
- public static final int RIGHT_ID = 5;
-
- public UserDictionary() {
-
- }
-
- /**
- * Lookup words in text
- * @param text
- * @return array of {wordId, position, length}
- */
- public int[][] lookup(String text) {
- TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
-
- for (String keyword : entries.descendingKeySet()) {
- int offset = 0;
- int position = text.indexOf(keyword, offset);
- while (offset < text.length() && position >= 0) {
- if(!result.containsKey(position)){
- result.put(position, entries.get(keyword));
- }
- offset += position + keyword.length();
- position = text.indexOf(keyword, offset);
- }
- }
-
- return toIndexArray(result);
- }
-
- /**
- * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
- * @param input
- * @return array of {wordId, index, length}
- */
- private int[][] toIndexArray(Map<Integer, int[]> input) {
- ArrayList<int[]> result = new ArrayList<int[]>();
- for (int i : input.keySet()) {
- int[] wordIdAndLength = input.get(i);
- int wordId = wordIdAndLength[0];
- // convert length to index
- int current = i;
- for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
- int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
- result.add(token);
- current += wordIdAndLength[j];
- }
- }
- return result.toArray(new int[result.size()][]);
- }
-
- @Override
- public int getLeftId(int wordId) {
- return LEFT_ID;
- }
-
- @Override
- public int getRightId(int wordId) {
- return RIGHT_ID;
- }
-
- @Override
- public int getWordCost(int wordId) {
- return WORD_COST;
- }
-
- @Override
- public String getReading(int wordId) {
- return getFeature(wordId, 0);
- }
-
- @Override
- public String getPartOfSpeech(int wordId) {
- return getFeature(wordId, 1);
- }
-
- @Override
- public String getAllFeatures(int wordId) {
- return getFeature(wordId);
- }
-
- @Override
- public String[] getAllFeaturesArray(int wordId) {
- String allFeatures = featureEntries.get(wordId);
- if(allFeatures == null) {
- return null;
- }
-
- return allFeatures.split(INTERNAL_SEPARATOR);
- }
-
-
- @Override
- public String getFeature(int wordId, int... fields) {
- String[] allFeatures = getAllFeaturesArray(wordId);
- if (allFeatures == null) {
- return null;
- }
- StringBuilder sb = new StringBuilder();
- if (fields.length == 0) { // All features
- for (String feature : allFeatures) {
- sb.append(CSVUtil.quoteEscape(feature)).append(",");
- }
- } else if (fields.length == 1) { // One feature doesn't need to escape value
- sb.append(allFeatures[fields[0]]).append(",");
- } else {
- for (int field : fields){
- sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
- }
- }
- return sb.deleteCharAt(sb.length() - 1).toString();
- }
-
- public static UserDictionary read(String filename) throws IOException {
- return read(new FileInputStream(filename));
- }
-
- public static UserDictionary read(InputStream is) throws IOException {
- UserDictionary dictionary = new UserDictionary();
- BufferedReader reader = new BufferedReader(new InputStreamReader(is));
- String line = null;
- int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
- while ((line = reader.readLine()) != null) {
- // Remove comments
- line = line.replaceAll("#.*$", "");
-
- // Skip empty lines or comment lines
- if (line.trim().length() == 0) {
- continue;
- }
- String[] values = CSVUtil.parse(line);
- String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
- String[] readings = values[2].replaceAll(" *", " ").split(" ");
- String pos = values[3];
-
- if (segmentation.length != readings.length) {
- // FIXME: Should probably deal with this differently. Exception?
- System.out.println("This entry is not properly formatted : " + line);
- }
-
- int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
- wordIdAndLength[0] = wordId;
- for (int i = 0; i < segmentation.length; i++) {
- wordIdAndLength[i + 1] = segmentation[i].length();
- dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
- wordId++;
- }
- dictionary.entries.put(values[0], wordIdAndLength);
- }
- reader.close();
- return dictionary;
- }
-
+
+ // Key: first CSV value of a user entry. Value: {first wordId, segment length, segment length, ...}.
+ // Filled by read(InputStream) and scanned by lookup(String).
+ private TreeMap<String, int[]> entries = new TreeMap<String, int[]>();
+
+ // Key: wordId of a single segment. Value: reading + INTERNAL_SEPARATOR + part of speech, as stored by read(InputStream).
+ private HashMap<Integer, String> featureEntries = new HashMap<Integer, String>();
+
+ // First wordId handed out by read(InputStream); each further segment gets the next integer.
+ private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
+
+ // Value returned by getWordCost(int) for every wordId.
+ public static final int WORD_COST = -100000;
+
+ // Value returned by getLeftId(int) for every wordId.
+ public static final int LEFT_ID = 5;
+
+ // Value returned by getRightId(int) for every wordId.
+ public static final int RIGHT_ID = 5;
+
+ /**
+  * Creates a dictionary with no entries. Within this class, entries are only added by
+  * {@link #read(InputStream)}.
+  */
+ public UserDictionary() {
+
+ }
+
+ /**
+  * Finds every occurrence of every user-dictionary keyword in the text.
+  * <p>
+  * Keywords are tried in descending key order, so a keyword is tried before any of its own
+  * prefixes. The first keyword to match at a given start position keeps that position;
+  * later keywords matching at the same position are ignored.
+  *
+  * @param text text to scan
+  * @return array of {wordId, position, length}, one row per segment of each match, ordered by
+  *         the start position of the match
+  */
+ public int[][] lookup(String text) {
+   TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // match position -> {first wordId, length, length...}
+
+   for (String keyword : entries.descendingKeySet()) {
+     if (keyword.length() == 0) {
+       continue; // an empty keyword matches at every index and would never advance the scan
+     }
+     int[] wordIdAndLength = entries.get(keyword);
+     int position = text.indexOf(keyword);
+     while (position >= 0) {
+       if (!result.containsKey(position)) {
+         result.put(position, wordIdAndLength);
+       }
+       // Resume directly after this match. Adding the match position to a running offset
+       // instead overshoots from the second match on and skips later occurrences.
+       position = text.indexOf(keyword, position + keyword.length());
+     }
+   }
+
+   return toIndexArray(result);
+ }
+
+ /**
+  * Flattens a map of start index to {first wordId, length, length, ...} into one row per length.
+  * <p>
+  * For each map entry, the n-th length yields the row {first wordId + n, start, length}, where
+  * start begins at the entry's key and advances by each length in turn.
+  *
+  * @param input start index mapped to {first wordId, length, length, ...}
+  * @return array of {wordId, index, length}, in the iteration order of {@code input}
+  */
+ private int[][] toIndexArray(Map<Integer, int[]> input) {
+   ArrayList<int[]> rows = new ArrayList<int[]>();
+   for (Map.Entry<Integer, int[]> match : input.entrySet()) {
+     int segmentStart = match.getKey();
+     int[] wordIdAndLength = match.getValue();
+     int firstWordId = wordIdAndLength[0];
+     // slot 0 is the id of the first segment; slots 1..n are the segment lengths
+     for (int slot = 1; slot < wordIdAndLength.length; slot++) {
+       int segmentLength = wordIdAndLength[slot];
+       rows.add(new int[] { firstWordId + slot - 1, segmentStart, segmentLength });
+       segmentStart += segmentLength;
+     }
+   }
+   int[][] table = new int[rows.size()][];
+   return rows.toArray(table);
+ }
+
+ /** Returns {@link #LEFT_ID} for every word id; {@code wordId} is not consulted. */
+ @Override
+ public int getLeftId(int wordId) {
+ return LEFT_ID;
+ }
+
+ /** Returns {@link #RIGHT_ID} for every word id; {@code wordId} is not consulted. */
+ @Override
+ public int getRightId(int wordId) {
+ return RIGHT_ID;
+ }
+
+ /** Returns {@link #WORD_COST} for every word id; {@code wordId} is not consulted. */
+ @Override
+ public int getWordCost(int wordId) {
+ return WORD_COST;
+ }
+
+ /**
+  * Returns feature 0 of the word, which is the reading stored by {@link #read(InputStream)}.
+  *
+  * @return the reading, or {@code null} when the word id has no stored features
+  */
+ @Override
+ public String getReading(int wordId) {
+ return getFeature(wordId, 0);
+ }
+
+ /**
+  * Returns feature 1 of the word, which is the part of speech stored by {@link #read(InputStream)}.
+  *
+  * @return the part of speech, or {@code null} when the word id has no stored features
+  */
+ @Override
+ public String getPartOfSpeech(int wordId) {
+ return getFeature(wordId, 1);
+ }
+
+ /**
+  * Returns all features of the word as one comma-joined string; calls
+  * {@link #getFeature(int, int...)} with no field indices.
+  *
+  * @return the joined features, or {@code null} when the word id has no stored features
+  */
+ @Override
+ public String getAllFeatures(int wordId) {
+ return getFeature(wordId);
+ }
+
+ /**
+  * Looks up the stored feature string for a word id and splits it into its parts.
+  * The separator is handed to {@code String.split}, which interprets it as a regular expression.
+  *
+  * @return the feature parts, or {@code null} when nothing is stored for the word id
+  */
+ @Override
+ public String[] getAllFeaturesArray(int wordId) {
+   String joined = featureEntries.get(wordId);
+   return joined == null ? null : joined.split(INTERNAL_SEPARATOR);
+ }
+
+
+ /**
+  * Returns selected features of a word as a comma-joined string.
+  * <ul>
+  * <li>No indices: every feature, each passed through {@code CSVUtil.quoteEscape}.</li>
+  * <li>Exactly one index: that feature as stored, without escaping.</li>
+  * <li>Several indices: those features in the given order, each passed through {@code CSVUtil.quoteEscape}.</li>
+  * </ul>
+  *
+  * @param wordId word id to look up
+  * @param fields zero-based feature indices; empty selects all features
+  * @return the joined features, or {@code null} when the word id has no stored features
+  */
+ @Override
+ public String getFeature(int wordId, int... fields) {
+   String[] allFeatures = getAllFeaturesArray(wordId);
+   if (allFeatures == null) {
+     return null;
+   }
+   if (fields.length == 1) {
+     // A single value is returned as stored, without escaping.
+     return allFeatures[fields[0]];
+   }
+   StringBuilder joined = new StringBuilder();
+   if (fields.length == 0) {
+     for (int i = 0; i < allFeatures.length; i++) {
+       joined.append(CSVUtil.quoteEscape(allFeatures[i])).append(",");
+     }
+   } else {
+     for (int i = 0; i < fields.length; i++) {
+       joined.append(CSVUtil.quoteEscape(allFeatures[fields[i]])).append(",");
+     }
+   }
+   // Drop the comma appended after the last value.
+   return joined.deleteCharAt(joined.length() - 1).toString();
+ }
+
+ public static UserDictionary read(String filename) throws IOException {
+ return read(new FileInputStream(filename));
+ }
+
+ /**
+  * Parses user dictionary entries from CSV text, one entry per line.
+  * <p>
+  * Everything from a {@code #} to the end of a line is discarded before parsing, and lines that
+  * are blank afterwards are skipped. Of the values returned by {@code CSVUtil.parse}, value 0 is
+  * the lookup key, value 1 the space-separated segmentation, value 2 the space-separated
+  * readings (one per segment) and value 3 the part of speech applied to every segment.
+  * Each segment receives the next word id, starting at {@code CUSTOM_DICTIONARY_WORD_ID_OFFSET}.
+  *
+  * @param is stream to read; it is closed before this method returns, whether or not parsing succeeds
+  * @return the populated dictionary
+  * @throws IOException if reading from the stream fails
+  */
+ public static UserDictionary read(InputStream is) throws IOException {
+   UserDictionary dictionary = new UserDictionary();
+   // NOTE(review): no charset is given, so bytes are decoded with the platform default - confirm that is intended.
+   BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+   try {
+     String line;
+     int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
+     while ((line = reader.readLine()) != null) {
+       // Remove comments
+       line = line.replaceAll("#.*$", "");
+
+       // Skip empty lines or comment lines
+       if (line.trim().length() == 0) {
+         continue;
+       }
+       String[] values = CSVUtil.parse(line);
+       // Split on runs of one or more spaces. A pattern that can match the empty string
+       // would put a separator between every pair of characters.
+       String[] segmentation = values[1].split(" +");
+       String[] readings = values[2].split(" +");
+       String pos = values[3];
+
+       if (segmentation.length != readings.length) {
+         // FIXME: Should probably deal with this differently. Exception?
+         System.out.println("This entry is not properly formatted : " + line);
+       }
+
+       int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
+       wordIdAndLength[0] = wordId;
+       for (int i = 0; i < segmentation.length; i++) {
+         wordIdAndLength[i + 1] = segmentation[i].length();
+         dictionary.featureEntries.put(wordId, readings[i] + INTERNAL_SEPARATOR + pos);
+         wordId++;
+       }
+       dictionary.entries.put(values[0], wordIdAndLength);
+     }
+   } finally {
+     // Closing the reader also closes the underlying stream, on failure as well as success.
+     reader.close();
+   }
+   return dictionary;
+ }
+
}