You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/03 05:33:57 UTC
svn commit: r1226637 [3/3] - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucen...
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Tue Jan 3 04:33:56 2012
@@ -31,323 +31,323 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class Viterbi {
-
- private final DoubleArrayTrie trie;
-
- private final TokenInfoDictionary dictionary;
-
- private final UnknownDictionary unkDictionary;
-
- private final ConnectionCosts costs;
-
- private final UserDictionary userDictionary;
-
- private final CharacterDefinition characterDefinition;
-
- private final boolean useUserDictionary;
-
- private final boolean searchMode;
-
- private final boolean extendedMode;
-
- private static final int DEFAULT_COST = 10000000;
-
- private static final int SEARCH_MODE_LENGTH_KANJI = 3;
-
- private static final int SEARCH_MODE_LENGTH = 7;
-
- private static final int SEARCH_MODE_PENALTY = 10000;
-
- private static final String BOS = "BOS";
-
- private static final String EOS = "EOS";
-
- /**
- * Constructor
- * @param trie
- * @param targetMap
- * @param dictionary
- * @param unkDictionary
- * @param costs
- * @param userDictionary
- */
- public Viterbi(DoubleArrayTrie trie,
- TokenInfoDictionary dictionary,
- UnknownDictionary unkDictionary,
- ConnectionCosts costs,
- UserDictionary userDictionary,
- Mode mode) {
- this.trie = trie;
- this.dictionary = dictionary;
- this.unkDictionary = unkDictionary;
- this.costs = costs;
- this.userDictionary = userDictionary;
- if(userDictionary == null) {
- this.useUserDictionary = false;
- } else {
- this.useUserDictionary = true;
- }
-
- switch(mode){
- case SEARCH:
- searchMode = true;
- extendedMode = false;
- break;
- case EXTENDED:
- searchMode = true;
- extendedMode = true;
- break;
- default:
- searchMode = false;
- extendedMode = false;
- break;
- }
-
- this.characterDefinition = unkDictionary.getCharacterDefinition();
- }
-
- /**
- * Find best path from input lattice.
- * @param lattice the result of build method
- * @return List of ViterbiNode which consist best path
- */
- public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
- ViterbiNode[][] startIndexArr = lattice[0];
- ViterbiNode[][] endIndexArr = lattice[1];
-
- for (int i = 1; i < startIndexArr.length; i++){
-
- if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists.
- continue;
- }
-
- for (ViterbiNode node : startIndexArr[i]) {
- if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index
- break;
- }
-
- int backwardConnectionId = node.getLeftId();
- int wordCost = node.getWordCost();
- int leastPathCost = DEFAULT_COST;
- for (ViterbiNode leftNode : endIndexArr[i]) {
- if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
- break;
- }
-
- int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
-
- // "Search mode". Add extra costs if it is long node.
- if (searchMode) {
-// System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
- String surfaceForm = node.getSurfaceForm();
- int length = surfaceForm.length();
- if (length > SEARCH_MODE_LENGTH_KANJI) {
- boolean allKanji = true;
- // check if node consists of only kanji
- for (int pos = 0; pos < length; pos++) {
- if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
- allKanji = false;
- break;
- }
- }
-
- if (allKanji) { // Process only Kanji keywords
- pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
- } else if (length > SEARCH_MODE_LENGTH) {
- pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
- }
- }
- }
-
- if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left).
- leastPathCost = pathCost;
- node.setPathCost(leastPathCost);
- node.setLeftNode(leftNode);
- }
- }
- }
- }
-
- // track best path
- ViterbiNode node = endIndexArr[0][0]; // EOS
- LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
- result.add(node);
- while (true) {
- ViterbiNode leftNode = node.getLeftNode();
- if (leftNode == null) {
- break;
- }
-
- // EXTENDED mode convert unknown word into unigram node
- if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
- int unigramWordId = CharacterClass.NGRAM.getId();
- int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
- int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
- int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
- String surfaceForm = leftNode.getSurfaceForm();
- for (int i = surfaceForm.length(); i > 0; i--) {
- ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
- result.addFirst(uniGramNode);
- }
- } else {
- result.addFirst(leftNode);
- }
- node = leftNode;
- }
-
- return result;
- }
-
-
- /**
- * Build lattice from input text
- * @param text
- * @return
- */
- public ViterbiNode[][][] build(String text) {
- int textLength = text.length();
- ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
- ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
- int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
- int[] endSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in endIndexArr
-
- ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
- addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
-
- // Process user dictionary;
- if (useUserDictionary) {
- processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
- }
-
- int unknownWordEndIndex = -1; // index of the last character of unknown word
-
- for (int startIndex = 0; startIndex < textLength; startIndex++) {
- // If no token ends where current token starts, skip this index
- if (endSizeArr[startIndex + 1] == 0) {
- continue;
- }
-
- String suffix = text.substring(startIndex);
-
- boolean found = false;
- for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
- String prefix = suffix.substring(0, endIndex);
-
- int result = trie.lookup(prefix);
-
- if (result > 0) { // Found match in double array trie
- found = true; // Don't produce unknown word starting from this index
- for (int wordId : dictionary.lookupWordIds(result)) {
- ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
- addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
- }
- } else if(result < 0) { // If result is less than zero, continue to next position
- break;
- }
- }
-
- // In the case of normal mode, it doesn't process unknown word greedily.
- if(!searchMode && unknownWordEndIndex > startIndex){
- continue;
- }
-
- // Process Unknown Word
- int unknownWordLength = 0;
- char firstCharacter = suffix.charAt(0);
- boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
- if (isInvoke){ // Process "invoke"
- unknownWordLength = unkDictionary.lookup(suffix);
- } else if (found == false){ // Process not "invoke"
- unknownWordLength = unkDictionary.lookup(suffix);
- }
-
- if (unknownWordLength > 0) { // found unknown word
- String unkWord = suffix.substring(0, unknownWordLength);
- int characterId = characterDefinition.lookup(firstCharacter);
- int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
-
- for (int wordId : wordIds) {
- ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
- addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
- }
- unknownWordEndIndex = startIndex + unknownWordLength;
- }
- }
-
- ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
- addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
-
- ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
-
- return result;
- }
-
- /**
- * Find token(s) in input text and set found token(s) in arrays as normal tokens
- * @param text
- * @param startIndexArr
- * @param endIndexArr
- * @param startSizeArr
- * @param endSizeArr
- */
- private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
- int[][] result = userDictionary.lookup(text);
- for(int[] segmentation : result) {
- int wordId = segmentation[0];
- int index = segmentation[1];
- int length = segmentation[2];
- ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
- addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
- }
- }
-
- /**
- * Add node to arrays and increment count in size array
- * @param node
- * @param startIndex
- * @param endIndex
- * @param startIndexArr
- * @param endIndexArr
- * @param startSizeArr
- * @param endSizeArr
- */
- private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
- int startNodesCount = startSizeArr[startIndex];
- int endNodesCount = endSizeArr[endIndex];
-
- if (startNodesCount == 0) {
- startIndexArr[startIndex] = new ViterbiNode[10];
- }
-
- if (endNodesCount == 0) {
- endIndexArr[endIndex] = new ViterbiNode[10];
- }
-
- if (startIndexArr[startIndex].length <= startNodesCount){
- startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
- }
-
- if (endIndexArr[endIndex].length <= endNodesCount){
- endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
- }
-
- startIndexArr[startIndex][startNodesCount] = node;
- endIndexArr[endIndex][endNodesCount] = node;
-
- startSizeArr[startIndex] = startNodesCount + 1;
- endSizeArr[endIndex] = endNodesCount + 1;
- }
-
-
- /**
- * Return twice as big array which contains value of input array
- * @param array
- * @return
- */
- private ViterbiNode[] extendArray(ViterbiNode[] array) {
- //extend array
- ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
- System.arraycopy(array, 0, newArray, 0, array.length);
- return newArray;
- }
+
+ private final DoubleArrayTrie trie;
+
+ private final TokenInfoDictionary dictionary;
+
+ private final UnknownDictionary unkDictionary;
+
+ private final ConnectionCosts costs;
+
+ private final UserDictionary userDictionary;
+
+ private final CharacterDefinition characterDefinition;
+
+ private final boolean useUserDictionary;
+
+ private final boolean searchMode;
+
+ private final boolean extendedMode;
+
+ private static final int DEFAULT_COST = 10000000;
+
+ private static final int SEARCH_MODE_LENGTH_KANJI = 3;
+
+ private static final int SEARCH_MODE_LENGTH = 7;
+
+ private static final int SEARCH_MODE_PENALTY = 10000;
+
+ private static final String BOS = "BOS";
+
+ private static final String EOS = "EOS";
+
+ /**
+ * Constructor
+ * @param trie
+ * @param targetMap
+ * @param dictionary
+ * @param unkDictionary
+ * @param costs
+ * @param userDictionary
+ */
+ public Viterbi(DoubleArrayTrie trie,
+ TokenInfoDictionary dictionary,
+ UnknownDictionary unkDictionary,
+ ConnectionCosts costs,
+ UserDictionary userDictionary,
+ Mode mode) {
+ this.trie = trie;
+ this.dictionary = dictionary;
+ this.unkDictionary = unkDictionary;
+ this.costs = costs;
+ this.userDictionary = userDictionary;
+ if(userDictionary == null) {
+ this.useUserDictionary = false;
+ } else {
+ this.useUserDictionary = true;
+ }
+
+ switch(mode){
+ case SEARCH:
+ searchMode = true;
+ extendedMode = false;
+ break;
+ case EXTENDED:
+ searchMode = true;
+ extendedMode = true;
+ break;
+ default:
+ searchMode = false;
+ extendedMode = false;
+ break;
+ }
+
+ this.characterDefinition = unkDictionary.getCharacterDefinition();
+ }
+
+ /**
+ * Find best path from input lattice.
+ * @param lattice the result of build method
+ * @return List of ViterbiNode which consist best path
+ */
+ public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
+ ViterbiNode[][] startIndexArr = lattice[0];
+ ViterbiNode[][] endIndexArr = lattice[1];
+
+ for (int i = 1; i < startIndexArr.length; i++){
+
+ if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists.
+ continue;
+ }
+
+ for (ViterbiNode node : startIndexArr[i]) {
+ if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+ break;
+ }
+
+ int backwardConnectionId = node.getLeftId();
+ int wordCost = node.getWordCost();
+ int leastPathCost = DEFAULT_COST;
+ for (ViterbiNode leftNode : endIndexArr[i]) {
+ if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+ break;
+ }
+
+ int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
+
+ // "Search mode". Add extra costs if it is long node.
+ if (searchMode) {
+ // System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
+ String surfaceForm = node.getSurfaceForm();
+ int length = surfaceForm.length();
+ if (length > SEARCH_MODE_LENGTH_KANJI) {
+ boolean allKanji = true;
+ // check if node consists of only kanji
+ for (int pos = 0; pos < length; pos++) {
+ if (!characterDefinition.isKanji(surfaceForm.charAt(pos))){
+ allKanji = false;
+ break;
+ }
+ }
+
+ if (allKanji) { // Process only Kanji keywords
+ pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
+ } else if (length > SEARCH_MODE_LENGTH) {
+ pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
+ }
+ }
+ }
+
+ if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left).
+ leastPathCost = pathCost;
+ node.setPathCost(leastPathCost);
+ node.setLeftNode(leftNode);
+ }
+ }
+ }
+ }
+
+ // track best path
+ ViterbiNode node = endIndexArr[0][0]; // EOS
+ LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
+ result.add(node);
+ while (true) {
+ ViterbiNode leftNode = node.getLeftNode();
+ if (leftNode == null) {
+ break;
+ }
+
+ // EXTENDED mode convert unknown word into unigram node
+ if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
+ int unigramWordId = CharacterClass.NGRAM.getId();
+ int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
+ int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
+ int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
+ String surfaceForm = leftNode.getSurfaceForm();
+ for (int i = surfaceForm.length(); i > 0; i--) {
+ ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm.substring(i - 1, i), unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i - 1, Type.UNKNOWN);
+ result.addFirst(uniGramNode);
+ }
+ } else {
+ result.addFirst(leftNode);
+ }
+ node = leftNode;
+ }
+
+ return result;
+ }
+
+
+ /**
+ * Build lattice from input text
+ * @param text
+ * @return
+ */
+ public ViterbiNode[][][] build(String text) {
+ int textLength = text.length();
+ ViterbiNode[][] startIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
+ ViterbiNode[][] endIndexArr = new ViterbiNode[textLength + 2][]; // text length + BOS and EOS
+ int[] startSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in startIndexArr
+ int[] endSizeArr = new int[textLength + 2]; // array to keep ViterbiNode count in endIndexArr
+
+ ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, 0, 0, -1, Type.KNOWN);
+ addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+
+ // Process user dictionary;
+ if (useUserDictionary) {
+ processUserDictionary(text, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+
+ int unknownWordEndIndex = -1; // index of the last character of unknown word
+
+ for (int startIndex = 0; startIndex < textLength; startIndex++) {
+ // If no token ends where current token starts, skip this index
+ if (endSizeArr[startIndex + 1] == 0) {
+ continue;
+ }
+
+ String suffix = text.substring(startIndex);
+
+ boolean found = false;
+ for (int endIndex = 1; endIndex < suffix.length() + 1; endIndex++) {
+ String prefix = suffix.substring(0, endIndex);
+
+ int result = trie.lookup(prefix);
+
+ if (result > 0) { // Found match in double array trie
+ found = true; // Don't produce unknown word starting from this index
+ for (int wordId : dictionary.lookupWordIds(result)) {
+ ViterbiNode node = new ViterbiNode(wordId, prefix, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+ addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ } else if(result < 0) { // If result is less than zero, continue to next position
+ break;
+ }
+ }
+
+ // In the case of normal mode, it doesn't process unknown word greedily.
+ if(!searchMode && unknownWordEndIndex > startIndex){
+ continue;
+ }
+
+ // Process Unknown Word
+ int unknownWordLength = 0;
+ char firstCharacter = suffix.charAt(0);
+ boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
+ if (isInvoke){ // Process "invoke"
+ unknownWordLength = unkDictionary.lookup(suffix);
+ } else if (found == false){ // Process not "invoke"
+ unknownWordLength = unkDictionary.lookup(suffix);
+ }
+
+ if (unknownWordLength > 0) { // found unknown word
+ String unkWord = suffix.substring(0, unknownWordLength);
+ int characterId = characterDefinition.lookup(firstCharacter);
+ int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
+
+ for (int wordId : wordIds) {
+ ViterbiNode node = new ViterbiNode(wordId, unkWord, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+ addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ unknownWordEndIndex = startIndex + unknownWordLength;
+ }
+ }
+
+ ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, 0, 0, textLength + 1, Type.KNOWN);
+ addToArrays(eosNode, textLength + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
+
+ ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
+
+ return result;
+ }
+
+ /**
+ * Find token(s) in input text and set found token(s) in arrays as normal tokens
+ * @param text
+ * @param startIndexArr
+ * @param endIndexArr
+ * @param startSizeArr
+ * @param endSizeArr
+ */
+ private void processUserDictionary(String text, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
+ int[][] result = userDictionary.lookup(text);
+ for(int[] segmentation : result) {
+ int wordId = segmentation[0];
+ int index = segmentation[1];
+ int length = segmentation[2];
+ ViterbiNode node = new ViterbiNode(wordId, text.substring(index, index + length), userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
+ addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+ }
+ }
+
+ /**
+ * Add node to arrays and increment count in size array
+ * @param node
+ * @param startIndex
+ * @param endIndex
+ * @param startIndexArr
+ * @param endIndexArr
+ * @param startSizeArr
+ * @param endSizeArr
+ */
+ private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
+ int startNodesCount = startSizeArr[startIndex];
+ int endNodesCount = endSizeArr[endIndex];
+
+ if (startNodesCount == 0) {
+ startIndexArr[startIndex] = new ViterbiNode[10];
+ }
+
+ if (endNodesCount == 0) {
+ endIndexArr[endIndex] = new ViterbiNode[10];
+ }
+
+ if (startIndexArr[startIndex].length <= startNodesCount){
+ startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
+ }
+
+ if (endIndexArr[endIndex].length <= endNodesCount){
+ endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
+ }
+
+ startIndexArr[startIndex][startNodesCount] = node;
+ endIndexArr[endIndex][endNodesCount] = node;
+
+ startSizeArr[startIndex] = startNodesCount + 1;
+ endSizeArr[endIndex] = endNodesCount + 1;
+ }
+
+
+ /**
+ * Return twice as big array which contains value of input array
+ * @param array
+ * @return
+ */
+ private ViterbiNode[] extendArray(ViterbiNode[] array) {
+ //extend array
+ ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
+ System.arraycopy(array, 0, newArray, 0, array.length);
+ return newArray;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Tue Jan 3 04:33:56 2012
@@ -18,105 +18,105 @@ package org.apache.lucene.analysis.kurom
*/
public class ViterbiNode {
- public enum Type {
- KNOWN,
- UNKNOWN,
- USER
- }
-
- private final int wordId;
-
- private final String surfaceForm;
-
- private final int leftId;
-
- private final int rightId;
-
- /** word cost for this node */
- private final int wordCost;
-
- /** minimum path cost found thus far */
- private int pathCost;
-
- private ViterbiNode leftNode;
-
- private final Type type;
-
- private final int startIndex;
-
- public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
- this.wordId = wordId;
- this.surfaceForm = surfaceForm;
- this.leftId = leftId;
- this.rightId = rightId;
- this.wordCost = wordCost;
- this.startIndex = startIndex;
- this.type = type;
- }
-
-
- /**
- * @return the wordId
- */
- public int getWordId() {
- return wordId;
- }
-
- /**
- * @return the surfaceForm
- */
- public String getSurfaceForm() {
- return surfaceForm;
- }
-
- /**
- * @return the leftId
- */
- public int getLeftId() {
- return leftId;
- }
-
- /**
- * @return the rightId
- */
- public int getRightId() {
- return rightId;
- }
-
- /**
- * @return the cost
- */
- public int getWordCost() {
- return wordCost;
- }
-
- /**
- * @return the cost
- */
- public int getPathCost() {
- return pathCost;
- }
-
- /**
- * param cost minimum path cost found this far
- */
- public void setPathCost(int pathCost) {
- this.pathCost = pathCost;
- }
-
- public void setLeftNode(ViterbiNode node) {
- leftNode = node;
- }
-
- public ViterbiNode getLeftNode() {
- return leftNode;
- }
-
- public int getStartIndex() {
- return startIndex;
- }
-
- public Type getType() {
- return type;
- }
+ public enum Type {
+ KNOWN,
+ UNKNOWN,
+ USER
+ }
+
+ private final int wordId;
+
+ private final String surfaceForm;
+
+ private final int leftId;
+
+ private final int rightId;
+
+ /** word cost for this node */
+ private final int wordCost;
+
+ /** minimum path cost found thus far */
+ private int pathCost;
+
+ private ViterbiNode leftNode;
+
+ private final Type type;
+
+ private final int startIndex;
+
+ public ViterbiNode(int wordId, String surfaceForm, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+ this.wordId = wordId;
+ this.surfaceForm = surfaceForm;
+ this.leftId = leftId;
+ this.rightId = rightId;
+ this.wordCost = wordCost;
+ this.startIndex = startIndex;
+ this.type = type;
+ }
+
+
+ /**
+ * @return the wordId
+ */
+ public int getWordId() {
+ return wordId;
+ }
+
+ /**
+ * @return the surfaceForm
+ */
+ public String getSurfaceForm() {
+ return surfaceForm;
+ }
+
+ /**
+ * @return the leftId
+ */
+ public int getLeftId() {
+ return leftId;
+ }
+
+ /**
+ * @return the rightId
+ */
+ public int getRightId() {
+ return rightId;
+ }
+
+ /**
+ * @return the cost
+ */
+ public int getWordCost() {
+ return wordCost;
+ }
+
+ /**
+ * @return the cost
+ */
+ public int getPathCost() {
+ return pathCost;
+ }
+
+ /**
+ * param cost minimum path cost found this far
+ */
+ public void setPathCost(int pathCost) {
+ this.pathCost = pathCost;
+ }
+
+ public void setLeftNode(ViterbiNode node) {
+ leftNode = node;
+ }
+
+ public ViterbiNode getLeftNode() {
+ return leftNode;
+ }
+
+ public int getStartIndex() {
+ return startIndex;
+ }
+
+ public Type getType() {
+ return type;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Tue Jan 3 04:33:56 2012
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.kurom
* limitations under the License.
*/
-import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.List;
@@ -30,69 +29,69 @@ import org.junit.BeforeClass;
import org.junit.Test;
public class TokenizerTest extends LuceneTestCase {
-
- private static Tokenizer tokenizer;
-
- @BeforeClass
- public static void setUpBeforeClass() throws Exception {
- tokenizer = Tokenizer.builder().build();
- }
-
- @AfterClass
- public static void afterClass() throws Exception {
- tokenizer = null;
- }
-
- @Test
- public void testSegmentation() {
- // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
-// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
-// String[] surfaceForms = {
-// "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
-// "スペース", "ステーション", "に", "行き", "ます", "。",
-// "うたがわしい", "。"
-// };
- String input = "スペースステーションに行きます。うたがわしい。";
- String[] surfaceForms = {
- "スペース", "ステーション", "に", "行き", "ます", "。",
- "うたがわしい", "。"
- };
- List<Token> tokens = tokenizer.tokenize(input);
- assertTrue(tokens.size() == surfaceForms.length);
- for (int i = 0; i < tokens.size(); i++) {
- assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
- }
- }
-
-
- @Test
- public void testReadings() {
- List<Token> tokens = tokenizer.tokenize("寿司が食べたいです。");
- assertTrue(tokens.size() == 6);
- assertEquals(tokens.get(0).getReading(), "スシ");
- assertEquals(tokens.get(1).getReading(), "ガ");
- assertEquals(tokens.get(2).getReading(), "タベ");
- assertEquals(tokens.get(3).getReading(), "タイ");
- assertEquals(tokens.get(4).getReading(), "デス");
- assertEquals(tokens.get(5).getReading(), "。");
- }
-
- public void testBocchan() throws Exception {
- doTestBocchan(1);
- }
-
- @Test @Nightly
- public void testBocchanBig() throws Exception {
- doTestBocchan(100);
- }
-
- private void doTestBocchan(int numIterations) throws Exception {
- LineNumberReader reader = new LineNumberReader(new InputStreamReader(
+
+ private static Tokenizer tokenizer;
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ tokenizer = Tokenizer.builder().build();
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ tokenizer = null;
+ }
+
+ @Test
+ public void testSegmentation() {
+ // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
+ // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
+ // String[] surfaceForms = {
+ // "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
+ // "スペース", "ステーション", "に", "行き", "ます", "。",
+ // "うたがわしい", "。"
+ // };
+ String input = "スペースステーションに行きます。うたがわしい。";
+ String[] surfaceForms = {
+ "スペース", "ステーション", "に", "行き", "ます", "。",
+ "うたがわしい", "。"
+ };
+ List<Token> tokens = tokenizer.tokenize(input);
+ assertTrue(tokens.size() == surfaceForms.length);
+ for (int i = 0; i < tokens.size(); i++) {
+ assertEquals(surfaceForms[i], tokens.get(i).getSurfaceForm());
+ }
+ }
+
+
+ @Test
+ public void testReadings() {
+ List<Token> tokens = tokenizer.tokenize("寿司が食べたいです。");
+ assertTrue(tokens.size() == 6);
+ assertEquals(tokens.get(0).getReading(), "スシ");
+ assertEquals(tokens.get(1).getReading(), "ガ");
+ assertEquals(tokens.get(2).getReading(), "タベ");
+ assertEquals(tokens.get(3).getReading(), "タイ");
+ assertEquals(tokens.get(4).getReading(), "デス");
+ assertEquals(tokens.get(5).getReading(), "。");
+ }
+
+ public void testBocchan() throws Exception {
+ doTestBocchan(1);
+ }
+
+ @Test @Nightly
+ public void testBocchanBig() throws Exception {
+ doTestBocchan(100);
+ }
+
+ private void doTestBocchan(int numIterations) throws Exception {
+ LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getResourceAsStream("bocchan.utf-8")));
String line = reader.readLine();
reader.close();
-
+
if (VERBOSE) {
System.out.println("Test for Bocchan without pre-splitting sentences");
}
@@ -114,5 +113,5 @@ public class TokenizerTest extends Lucen
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
- }
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java Tue Jan 3 04:33:56 2012
@@ -28,109 +28,109 @@ import org.apache.lucene.util.LuceneTest
import org.junit.Test;
public class UnknownDictionaryTest extends LuceneTestCase {
- public static final String FILENAME = "unk-tokeninfo-dict.obj";
-
- @Test
- public void testPutCharacterCategory() {
- UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
-
- try{
- unkDic.putCharacterCategory(0, "DUMMY_NAME");
- fail();
- } catch(Exception e) {
-
- }
-
- try{
- unkDic.putCharacterCategory(-1, "KATAKANA");
- fail();
- } catch(Exception e) {
-
- }
-
- unkDic.putCharacterCategory(0, "DEFAULT");
- unkDic.putCharacterCategory(1, "GREEK");
- unkDic.putCharacterCategory(2, "HIRAGANA");
- unkDic.putCharacterCategory(3, "KATAKANA");
- unkDic.putCharacterCategory(4, "KANJI");
- }
-
- @Test
- public void testPut() {
- UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
- try{
- unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*"));
- fail();
- } catch(Exception e){
-
- }
-
- String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*";
- String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*";
- String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*";
-
- unkDic.putCharacterCategory(0, "KANJI");
- unkDic.putCharacterCategory(1, "ALPHA");
- unkDic.putCharacterCategory(2, "HIRAGANA");
-
- unkDic.put(CSVUtil.parse(entry1));
- unkDic.put(CSVUtil.parse(entry2));
- unkDic.put(CSVUtil.parse(entry3));
- }
-
- private UnknownDictionary createDictionary() throws IOException {
- InputStream is = this.getClass().getClassLoader().getResourceAsStream("unk.def.utf-8");
- UnknownDictionary dictionary = new UnknownDictionary();
- BufferedReader reader = new BufferedReader(new InputStreamReader(is));
-
- String line = null;
- while((line = reader.readLine()) != null) {
- dictionary.put(CSVUtil.parse(line));
- }
- reader.close();
-
- is = this.getClass().getClassLoader().getResourceAsStream("char.def.utf-8");
- reader = new BufferedReader(new InputStreamReader(is));
-
- line = null;
- while ((line = reader.readLine()) != null) {
- line = line.replaceAll("^\\s", "");
- line = line.replaceAll("\\s*#.*", "");
- line = line.replaceAll("\\s+", " ");
-
- // Skip empty line or comment line
- if(line.length() == 0) {
- continue;
- }
-
- if(line.startsWith("0x")) { // Category mapping
- String[] values = line.split(" ", 2); // Split only first space
-
- if(!values[0].contains("..")) {
- int cp = Integer.decode(values[0]).intValue();
- dictionary.putCharacterCategory(cp, values[1]);
- } else {
- String[] codePoints = values[0].split("\\.\\.");
- int cpFrom = Integer.decode(codePoints[0]).intValue();
- int cpTo = Integer.decode(codePoints[1]).intValue();
-
- for(int i = cpFrom; i <= cpTo; i++){
- dictionary.putCharacterCategory(i, values[1]);
- }
- }
- } else { // Invoke definition
- String[] values = line.split(" "); // Consecutive space is merged above
- String characterClassName = values[0];
- int invoke = Integer.parseInt(values[1]);
- int group = Integer.parseInt(values[2]);
- int length = Integer.parseInt(values[3]);
- dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
- }
-
- }
-
- reader.close();
-
- return dictionary;
- }
+ public static final String FILENAME = "unk-tokeninfo-dict.obj";
+
+ @Test
+ public void testPutCharacterCategory() {
+ UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
+
+ try{
+ unkDic.putCharacterCategory(0, "DUMMY_NAME");
+ fail();
+ } catch(Exception e) {
+
+ }
+
+ try{
+ unkDic.putCharacterCategory(-1, "KATAKANA");
+ fail();
+ } catch(Exception e) {
+
+ }
+
+ unkDic.putCharacterCategory(0, "DEFAULT");
+ unkDic.putCharacterCategory(1, "GREEK");
+ unkDic.putCharacterCategory(2, "HIRAGANA");
+ unkDic.putCharacterCategory(3, "KATAKANA");
+ unkDic.putCharacterCategory(4, "KANJI");
+ }
+
+ @Test
+ public void testPut() {
+ UnknownDictionary unkDic = new UnknownDictionary(10 * 1024 * 1024);
+ try{
+ unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*"));
+ fail();
+ } catch(Exception e){
+
+ }
+
+ String entry1 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*";
+ String entry2 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*";
+ String entry3 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*";
+
+ unkDic.putCharacterCategory(0, "KANJI");
+ unkDic.putCharacterCategory(1, "ALPHA");
+ unkDic.putCharacterCategory(2, "HIRAGANA");
+
+ unkDic.put(CSVUtil.parse(entry1));
+ unkDic.put(CSVUtil.parse(entry2));
+ unkDic.put(CSVUtil.parse(entry3));
+ }
+
+ private UnknownDictionary createDictionary() throws IOException {
+ InputStream is = this.getClass().getClassLoader().getResourceAsStream("unk.def.utf-8");
+ UnknownDictionary dictionary = new UnknownDictionary();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+
+ String line = null;
+ while((line = reader.readLine()) != null) {
+ dictionary.put(CSVUtil.parse(line));
+ }
+ reader.close();
+
+ is = this.getClass().getClassLoader().getResourceAsStream("char.def.utf-8");
+ reader = new BufferedReader(new InputStreamReader(is));
+
+ line = null;
+ while ((line = reader.readLine()) != null) {
+ line = line.replaceAll("^\\s", "");
+ line = line.replaceAll("\\s*#.*", "");
+ line = line.replaceAll("\\s+", " ");
+
+ // Skip empty line or comment line
+ if(line.length() == 0) {
+ continue;
+ }
+
+ if(line.startsWith("0x")) { // Category mapping
+ String[] values = line.split(" ", 2); // Split only first space
+
+ if(!values[0].contains("..")) {
+ int cp = Integer.decode(values[0]).intValue();
+ dictionary.putCharacterCategory(cp, values[1]);
+ } else {
+ String[] codePoints = values[0].split("\\.\\.");
+ int cpFrom = Integer.decode(codePoints[0]).intValue();
+ int cpTo = Integer.decode(codePoints[1]).intValue();
+
+ for(int i = cpFrom; i <= cpTo; i++){
+ dictionary.putCharacterCategory(i, values[1]);
+ }
+ }
+ } else { // Invoke definition
+ String[] values = line.split(" "); // Consecutive space is merged above
+ String characterClassName = values[0];
+ int invoke = Integer.parseInt(values[1]);
+ int group = Integer.parseInt(values[2]);
+ int length = Integer.parseInt(values[3]);
+ dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
+ }
+
+ }
+
+ reader.close();
+
+ return dictionary;
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Tue Jan 3 04:33:56 2012
@@ -25,53 +25,53 @@ import org.apache.lucene.util.LuceneTest
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
-
- @Test
- public void testLookup() throws IOException {
-
- UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
- int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
- // Length should be three 関西, 国際, 空港
- assertEquals(3, dictionaryEntryResult.length);
-
- // Test positions
- assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
- assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
- assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
-
- // Test lengths
- assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
- assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
- assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
-
- int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
- // Length should be six
- assertEquals(6, dictionaryEntryResult2.length);
- }
-
- @Test
- public void testReadings() throws IOException {
+
+ @Test
+ public void testLookup() throws IOException {
+
UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
- int wordIdNihon = 100000000; // wordId of 日本 in 日本経済新聞
- assertEquals("ニホン", dictionary.getReading(wordIdNihon));
-
- int wordIdAsashoryu = 100000006; // wordId for 朝青龍
- assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
-
- int wordIdNotExist = 1;
- assertNull(dictionary.getReading(wordIdNotExist));
- }
-
- @Test
- public void testPartOfSpeech() throws IOException {
+ int[][] dictionaryEntryResult = dictionary.lookup("関西国際空港に行った");
+ // Length should be three 関西, 国際, 空港
+ assertEquals(3, dictionaryEntryResult.length);
+
+ // Test positions
+ assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
+ assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
+ assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
+
+ // Test lengths
+ assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
+ assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
+ assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
+
+ int[][] dictionaryEntryResult2 = dictionary.lookup("関西国際空港と関西国際空港に行った");
+ // Length should be six
+ assertEquals(6, dictionaryEntryResult2.length);
+ }
+
+ @Test
+ public void testReadings() throws IOException {
+ UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
+ int wordIdNihon = 100000000; // wordId of 日本 in 日本経済新聞
+ assertEquals("ニホン", dictionary.getReading(wordIdNihon));
+
+ int wordIdAsashoryu = 100000006; // wordId for 朝青龍
+ assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
+
+ int wordIdNotExist = 1;
+ assertNull(dictionary.getReading(wordIdNotExist));
+ }
+
+ @Test
+ public void testPartOfSpeech() throws IOException {
UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
- int wordIdKeizai = 100000001; // wordId of 経済 in 日本経済新聞
- assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai));
- }
-
- @Test
- public void testRead() throws IOException {
+ int wordIdKeizai = 100000001; // wordId of 経済 in 日本経済新聞
+ assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai));
+ }
+
+ @Test
+ public void testRead() throws IOException {
UserDictionary dictionary = UserDictionary.read( TokenizerTest.class.getResourceAsStream("userdict.txt"));
- assertNotNull(dictionary);
- }
+ assertNotNull(dictionary);
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrieTest.java Tue Jan 3 04:33:56 2012
@@ -27,76 +27,76 @@ import org.apache.lucene.util.LuceneTest
import org.junit.Test;
public class DoubleArrayTrieTest extends LuceneTestCase {
-
- @Test
- public void testBuild() {
- Trie trie = getTrie();
- DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
- doubleArrayTrie.build(trie);
- }
-
- @Test
- public void testWrite() throws IOException {
- Trie trie = getTrie();
-
- DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
- doubleArrayTrie.build(trie);
-
- try{
- doubleArrayTrie.write("/some/path/which/is/not/exist");
- fail();
- }catch(IOException e){
-
- }
-
- // nocommit: lets use TEMPDIR here
- String tmpDir = System.getProperty("java.io.tmpdir");
- File dir = new File(tmpDir + File.separator + "datmp");
- dir.mkdir();
- doubleArrayTrie.write(dir.getCanonicalPath());
- dir.deleteOnExit();
- for(File file : dir.listFiles()) {
- file.deleteOnExit();
- }
-
- assertTrue(dir.length() > 0);
-
- }
-
- @Test
- public void testLookup() throws IOException {
- Trie trie = getTrie();
-
- DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
- doubleArrayTrie.build(trie);
-
+
+ @Test
+ public void testBuild() {
+ Trie trie = getTrie();
+ DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+ doubleArrayTrie.build(trie);
+ }
+
+ @Test
+ public void testWrite() throws IOException {
+ Trie trie = getTrie();
+
+ DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+ doubleArrayTrie.build(trie);
+
+ try{
+ doubleArrayTrie.write("/some/path/which/is/not/exist");
+ fail();
+ }catch(IOException e){
+
+ }
+
// nocommit: lets use TEMPDIR here
- String tmpDir = System.getProperty("java.io.tmpdir");
- File dir = new File(tmpDir + File.separator + "datmp");
- dir.mkdir();
- doubleArrayTrie.write(dir.getCanonicalPath());
- dir.deleteOnExit();
- for(File file : dir.listFiles()) {
- file.deleteOnExit();
- }
-
- doubleArrayTrie = DoubleArrayTrie.read(new FileInputStream(dir.getCanonicalPath() + File.separator + DoubleArrayTrie.FILENAME));
-
- assertEquals(0, doubleArrayTrie.lookup("a"));
- assertTrue(doubleArrayTrie.lookup("abc") > 0);
- assertTrue(doubleArrayTrie.lookup("ããã") > 0);
- assertTrue(doubleArrayTrie.lookup("xyz") < 0);
-
- }
-
- private Trie getTrie() {
- Trie trie = new Trie();
- trie.add("abc");
- trie.add("abd");
- trie.add("ããã");
- trie.add("ããã");
- return trie;
- }
-
-
+ String tmpDir = System.getProperty("java.io.tmpdir");
+ File dir = new File(tmpDir + File.separator + "datmp");
+ dir.mkdir();
+ doubleArrayTrie.write(dir.getCanonicalPath());
+ dir.deleteOnExit();
+ for(File file : dir.listFiles()) {
+ file.deleteOnExit();
+ }
+
+ assertTrue(dir.length() > 0);
+
+ }
+
+ @Test
+ public void testLookup() throws IOException {
+ Trie trie = getTrie();
+
+ DoubleArrayTrie doubleArrayTrie = new DoubleArrayTrie();
+ doubleArrayTrie.build(trie);
+
+ // nocommit: lets use TEMPDIR here
+ String tmpDir = System.getProperty("java.io.tmpdir");
+ File dir = new File(tmpDir + File.separator + "datmp");
+ dir.mkdir();
+ doubleArrayTrie.write(dir.getCanonicalPath());
+ dir.deleteOnExit();
+ for(File file : dir.listFiles()) {
+ file.deleteOnExit();
+ }
+
+ doubleArrayTrie = DoubleArrayTrie.read(new FileInputStream(dir.getCanonicalPath() + File.separator + DoubleArrayTrie.FILENAME));
+
+ assertEquals(0, doubleArrayTrie.lookup("a"));
+ assertTrue(doubleArrayTrie.lookup("abc") > 0);
+ assertTrue(doubleArrayTrie.lookup("ããã") > 0);
+ assertTrue(doubleArrayTrie.lookup("xyz") < 0);
+
+ }
+
+ private Trie getTrie() {
+ Trie trie = new Trie();
+ trie.add("abc");
+ trie.add("abd");
+ trie.add("ããã");
+ trie.add("ããã");
+ return trie;
+ }
+
+
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/NodeTest.java Tue Jan 3 04:33:56 2012
@@ -23,137 +23,137 @@ import org.apache.lucene.util.LuceneTest
import org.junit.Test;
public class NodeTest extends LuceneTestCase {
-
- @Test
- public void testNode() {
- Trie trie = new Trie();
-
- Node node = trie.new Node('!');
- assertEquals('!', node.getKey());
-
- node = trie.new Node('1');
- assertEquals('1', node.getKey());
-
- node = trie.new Node('a');
- assertEquals('a', node.getKey());
-
- node = trie.new Node('ï¼');
- assertEquals('ï¼', node.getKey());
-
- node = trie.new Node('ï¼');
- assertEquals('ï¼', node.getKey());
-
- node = trie.new Node('ã');
- assertEquals('ã', node.getKey());
-
- node = trie.new Node('漢');
- assertEquals('漢', node.getKey());
-
- }
-
- @Test
- public void testAddChild() {
- Trie trie = new Trie();
- Node node = trie.new Node('a');
-
- Node returnedNode = node.addChild(trie.new Node('b'));
- assertEquals('b', returnedNode.getKey());
- assertEquals(1, node.getChildren().length);
- assertEquals('b', node.getChildren()[0].getKey());
-
- returnedNode = node.addChild(trie.new Node('c'));
- assertEquals('c', returnedNode.getKey());
- assertEquals(2, node.getChildren().length);
- assertEquals('c', node.getChildren()[1].getKey());
- }
-
- @Test
- public void testAdd() {
- Trie trie = new Trie();
-
- Node node = trie.new Node('a');
- node.add("");
- assertEquals(0, node.getChildren().length);
-
- node = trie.new Node('a');
- node.add("b");
- assertEquals(1, node.getChildren().length);
- assertEquals('b', node.getChildren()[0].getKey());
-
- node = trie.new Node('a');
- node.add("bc");
- Node b = node.getChildren()[0];
- assertEquals(1, node.getChildren().length);
- assertEquals('b', b.getKey());
- assertEquals(1, b.getChildren().length);
- Node c = b.getChildren()[0];
- assertEquals('c', c.getKey());
- assertEquals(0, c.getChildren().length);
-
- node.add("bd");
- b = node.getChildren()[0];
- assertEquals(1, node.getChildren().length);
- assertEquals('b', b.getKey());
- assertEquals(2, b.getChildren().length);
- c = b.getChildren()[0];
- assertEquals('c', c.getKey());
- assertEquals(0, c.getChildren().length);
- Node d = b.getChildren()[1];
- assertEquals('d', d.getKey());
- assertEquals(0, d.getChildren().length);
- }
-
-
- @Test
- public void testGetkey() {
- Trie trie = new Trie();
-
- Node node = trie.new Node('!');
- assertEquals('!', node.getKey());
-
- node = trie.new Node('1');
- assertEquals('1', node.getKey());
-
- node = trie.new Node('a');
- assertEquals('a', node.getKey());
-
- node = trie.new Node('ï¼');
- assertEquals('ï¼', node.getKey());
-
- node = trie.new Node('ï¼');
- assertEquals('ï¼', node.getKey());
-
- node = trie.new Node('ã');
- assertEquals('ã', node.getKey());
-
- node = trie.new Node('漢');
- assertEquals('漢', node.getKey());
- }
-
- @Test
- public void testHasSinglePath() {
- Trie trie = new Trie();
-
- Node node = trie.new Node('a');
- node.add("bcd");
- assertEquals(true, node.hasSinglePath());
-
- node.add("bce");
- assertEquals(false, node.hasSinglePath());
- }
-
- @Test
- public void testGetChildren() {
- Trie trie = new Trie();
-
- Node node = trie.new Node('a');
- node.add("bcd");
- node.add("bde");
- node.add("xyz");
-
- assertEquals(2, node.getChildren().length);
- assertEquals('b', node.getChildren()[0].getKey());
- assertEquals('x', node.getChildren()[1].getKey());
-
- }
+
+ @Test
+ public void testNode() {
+ Trie trie = new Trie();
+
+ Node node = trie.new Node('!');
+ assertEquals('!', node.getKey());
+
+ node = trie.new Node('1');
+ assertEquals('1', node.getKey());
+
+ node = trie.new Node('a');
+ assertEquals('a', node.getKey());
+
+ node = trie.new Node('ï¼');
+ assertEquals('ï¼', node.getKey());
+
+ node = trie.new Node('ï¼');
+ assertEquals('ï¼', node.getKey());
+
+ node = trie.new Node('ã');
+ assertEquals('ã', node.getKey());
+
+ node = trie.new Node('漢');
+ assertEquals('漢', node.getKey());
+
+ }
+
+ @Test
+ public void testAddChild() {
+ Trie trie = new Trie();
+ Node node = trie.new Node('a');
+
+ Node returnedNode = node.addChild(trie.new Node('b'));
+ assertEquals('b', returnedNode.getKey());
+ assertEquals(1, node.getChildren().length);
+ assertEquals('b', node.getChildren()[0].getKey());
+
+ returnedNode = node.addChild(trie.new Node('c'));
+ assertEquals('c', returnedNode.getKey());
+ assertEquals(2, node.getChildren().length);
+ assertEquals('c', node.getChildren()[1].getKey());
+ }
+
+ @Test
+ public void testAdd() {
+ Trie trie = new Trie();
+
+ Node node = trie.new Node('a');
+ node.add("");
+ assertEquals(0, node.getChildren().length);
+
+ node = trie.new Node('a');
+ node.add("b");
+ assertEquals(1, node.getChildren().length);
+ assertEquals('b', node.getChildren()[0].getKey());
+
+ node = trie.new Node('a');
+ node.add("bc");
+ Node b = node.getChildren()[0];
+ assertEquals(1, node.getChildren().length);
+ assertEquals('b', b.getKey());
+ assertEquals(1, b.getChildren().length);
+ Node c = b.getChildren()[0];
+ assertEquals('c', c.getKey());
+ assertEquals(0, c.getChildren().length);
+
+ node.add("bd");
+ b = node.getChildren()[0];
+ assertEquals(1, node.getChildren().length);
+ assertEquals('b', b.getKey());
+ assertEquals(2, b.getChildren().length);
+ c = b.getChildren()[0];
+ assertEquals('c', c.getKey());
+ assertEquals(0, c.getChildren().length);
+ Node d = b.getChildren()[1];
+ assertEquals('d', d.getKey());
+ assertEquals(0, d.getChildren().length);
+ }
+
+
+ @Test
+ public void testGetkey() {
+ Trie trie = new Trie();
+
+ Node node = trie.new Node('!');
+ assertEquals('!', node.getKey());
+
+ node = trie.new Node('1');
+ assertEquals('1', node.getKey());
+
+ node = trie.new Node('a');
+ assertEquals('a', node.getKey());
+
+ node = trie.new Node('ï¼');
+ assertEquals('ï¼', node.getKey());
+
+ node = trie.new Node('ï¼');
+ assertEquals('ï¼', node.getKey());
+
+ node = trie.new Node('ã');
+ assertEquals('ã', node.getKey());
+
+ node = trie.new Node('漢');
+ assertEquals('漢', node.getKey());
+ }
+
+ @Test
+ public void testHasSinglePath() {
+ Trie trie = new Trie();
+
+ Node node = trie.new Node('a');
+ node.add("bcd");
+ assertEquals(true, node.hasSinglePath());
+
+ node.add("bce");
+ assertEquals(false, node.hasSinglePath());
+ }
+
+ @Test
+ public void testGetChildren() {
+ Trie trie = new Trie();
+
+ Node node = trie.new Node('a');
+ node.add("bcd");
+ node.add("bde");
+ node.add("xyz");
+
+ assertEquals(2, node.getChildren().length);
+ assertEquals('b', node.getChildren()[0].getKey());
+ assertEquals('x', node.getChildren()[1].getKey());
+
+ }
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java?rev=1226637&r1=1226636&r2=1226637&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/trie/TrieTest.java Tue Jan 3 04:33:56 2012
@@ -23,50 +23,50 @@ import org.apache.lucene.util.LuceneTest
import org.junit.Test;
public class TrieTest extends LuceneTestCase {
-
- @Test
- public void testGetRoot() {
- Trie trie = new Trie();
- Node rootNode = trie.getRoot();
- assertNotNull(rootNode);
- }
-
- @Test
- public void testAdd() {
- Trie trie = new Trie();
- trie.add("aa");
- trie.add("ab");
- trie.add("bb");
-
- Node rootNode = trie.getRoot();
- assertEquals(2, rootNode.getChildren().length);
- assertEquals(2, rootNode.getChildren()[0].getChildren().length);
- assertEquals(1, rootNode.getChildren()[1].getChildren().length);
- }
-
- @Test
- public void testGetChildren() {
- Trie trie = new Trie();
- trie.add("aa");
- trie.add("ab");
- trie.add("bb");
-
- Node rootNode = trie.getRoot();
- assertEquals(2, rootNode.getChildren().length);
- assertEquals(2, rootNode.getChildren()[0].getChildren().length);
- assertEquals(1, rootNode.getChildren()[1].getChildren().length);
- }
-
- @Test
- public void testSinglePath() {
- Trie trie = new Trie();
- assertTrue(trie.getRoot().hasSinglePath());
- trie.add("abcdef");
- assertTrue(trie.getRoot().hasSinglePath());
- trie.add("abdfg");
- Node rootNode = trie.getRoot();
- assertEquals(2, rootNode.getChildren()[0].getChildren()[0].getChildren().length);
- assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[0].hasSinglePath());
- assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[1].hasSinglePath());
- }
+
+ @Test
+ public void testGetRoot() {
+ Trie trie = new Trie();
+ Node rootNode = trie.getRoot();
+ assertNotNull(rootNode);
+ }
+
+ @Test
+ public void testAdd() {
+ Trie trie = new Trie();
+ trie.add("aa");
+ trie.add("ab");
+ trie.add("bb");
+
+ Node rootNode = trie.getRoot();
+ assertEquals(2, rootNode.getChildren().length);
+ assertEquals(2, rootNode.getChildren()[0].getChildren().length);
+ assertEquals(1, rootNode.getChildren()[1].getChildren().length);
+ }
+
+ @Test
+ public void testGetChildren() {
+ Trie trie = new Trie();
+ trie.add("aa");
+ trie.add("ab");
+ trie.add("bb");
+
+ Node rootNode = trie.getRoot();
+ assertEquals(2, rootNode.getChildren().length);
+ assertEquals(2, rootNode.getChildren()[0].getChildren().length);
+ assertEquals(1, rootNode.getChildren()[1].getChildren().length);
+ }
+
+ @Test
+ public void testSinglePath() {
+ Trie trie = new Trie();
+ assertTrue(trie.getRoot().hasSinglePath());
+ trie.add("abcdef");
+ assertTrue(trie.getRoot().hasSinglePath());
+ trie.add("abdfg");
+ Node rootNode = trie.getRoot();
+ assertEquals(2, rootNode.getChildren()[0].getChildren()[0].getChildren().length);
+ assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[0].hasSinglePath());
+ assertTrue(rootNode.getChildren()[0].getChildren()[0].getChildren()[1].hasSinglePath());
+ }
}