You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/11 11:12:14 UTC
svn commit: r1229948 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/viterbi/ resources/org/apache/luc...
Author: uschindler
Date: Wed Jan 11 10:12:13 2012
New Revision: 1229948
URL: http://svn.apache.org/viewvc?rev=1229948&view=rev
Log:
LUCENE-3305: apply wordid=0 fix (port of fix from kuromoji 0.7.7-SNAPSHOT on Github).
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Wed Jan 11 10:12:13 2012
@@ -166,7 +166,7 @@ public class Segmenter {
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
- if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS
+ if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
continue;
} else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
continue; // Do not emit punctuation
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Wed Jan 11 10:12:13 2012
@@ -65,14 +65,7 @@ public final class ConnectionCosts {
}
public int get(int forwardId, int backwardId) {
- // FIXME: There seems to be something wrong with the double array trie in some rare
- // cases causing and IndexOutOfBoundsException. Use a guard as a temporary work-around
- // and return a high cost to advise Mr. Viterbi strongly to not use this transition
- if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
- return costs[backwardId][forwardId];
- } else {
- return 50000;
- }
+ return costs[backwardId][forwardId];
}
public static ConnectionCosts getInstance() {
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Wed Jan 11 10:12:13 2012
@@ -206,7 +206,7 @@ public class Viterbi {
int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
FST.Arc<Long> arc = new FST.Arc<Long>();
- ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
+ ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
// Process user dictionary;
@@ -277,7 +277,7 @@ public class Viterbi {
}
}
- ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+ ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Wed Jan 11 10:12:13 2012
@@ -175,7 +175,21 @@ public class SegmenterTest extends Lucen
assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
}
-
+
+ // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+ // do we have a possibility to actually lookup the first and last word from dictionary?
+ public void testYabottai() {
+ List<Token> tokens = segmenter.tokenize("やぼったい");
+ assertEquals(1, tokens.size());
+ assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
+ }
+
+ public void testTsukitosha() {
+ List<Token> tokens = segmenter.tokenize("突き通しゃ");
+ assertEquals(1, tokens.size());
+ assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
+ }
+
public void testBocchan() throws Exception {
doTestBocchan(1);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan 11 10:12:13 2012
@@ -45,7 +45,7 @@ import com.ibm.icu.text.Normalizer2;
public class TokenInfoDictionaryBuilder {
/** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
- private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
+ private int offset = 0;
private String encoding = "euc-jp";