You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/11 11:12:14 UTC
svn commit: r1229948 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/viterbi/ resources/org/apache/luc...

Author: uschindler
Date: Wed Jan 11 10:12:13 2012
New Revision: 1229948

URL: http://svn.apache.org/viewvc?rev=1229948&view=rev
Log:
LUCENE-3305: apply wordid=0 fix (port of fix from kuromoji 0.7.7-SNAPSHOT on Github).

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java Wed Jan 11 10:12:13 2012
@@ -166,7 +166,7 @@ public class Segmenter {
     List<ViterbiNode> bestPath = viterbi.search(lattice);
     for (ViterbiNode node : bestPath) {
       int wordId = node.getWordId();
-      if (node.getType() == Type.KNOWN && wordId == 0){ // Do not include BOS/EOS 
+      if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS 
         continue;
       } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
         continue; // Do not emit punctuation

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.java Wed Jan 11 10:12:13 2012
@@ -65,14 +65,7 @@ public final class ConnectionCosts {
   }
   
   public int get(int forwardId, int backwardId) {
-    // FIXME: There seems to be something wrong with the double array trie in some rare
-    // cases causing and IndexOutOfBoundsException.  Use a guard as a temporary work-around
-    // and return a high cost to advise Mr. Viterbi strongly to not use this transition
-    if (backwardId < costs.length && forwardId < costs[backwardId].length ) {
-      return costs[backwardId][forwardId];
-    } else {
-      return 50000;
-    }
+    return costs[backwardId][forwardId];
   }
   
   public static ConnectionCosts getInstance() {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Wed Jan 11 10:12:13 2012
@@ -206,7 +206,7 @@ public class Viterbi {
     int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
     int[] endSizeArr = new int[length + 2];   // array to keep ViterbiNode count in endIndexArr
     FST.Arc<Long> arc = new FST.Arc<Long>();
-    ViterbiNode bosNode = new ViterbiNode(0, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
+    ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
     addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
     
     // Process user dictionary;
@@ -277,7 +277,7 @@ public class Viterbi {
       }
     }
     
-    ViterbiNode eosNode = new ViterbiNode(0, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+    ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
     addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
     
     ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Wed Jan 11 10:12:13 2012
@@ -175,7 +175,21 @@ public class SegmenterTest extends Lucen
     assertEquals("助動詞",       tokens.get(7).getPartOfSpeech());
     assertEquals("記号-句点",      tokens.get(8).getPartOfSpeech());
   }
-  
+
+  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
+  // do we have a possibility to actually lookup the first and last word from dictionary?
+  public void testYabottai() {
+    List<Token> tokens = segmenter.tokenize("やぼったい");
+    assertEquals(1, tokens.size());
+    assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
+  }
+
+  public void testTsukitosha() {
+    List<Token> tokens = segmenter.tokenize("突き通しゃ");
+    assertEquals(1, tokens.size());
+    assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
+  }
+
   public void testBocchan() throws Exception {
     doTestBocchan(1);
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1229948&r1=1229947&r2=1229948&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan 11 10:12:13 2012
@@ -45,7 +45,7 @@ import com.ibm.icu.text.Normalizer2;
 public class TokenInfoDictionaryBuilder {
   
   /** Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file */
-  private int offset = 4; // Start from 4. First 4 bytes are used to store size of dictionary file.
+  private int offset = 0;
   
   private String encoding = "euc-jp";