You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/09 09:10:38 UTC
svn commit: r1229052 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ resources/org/apache/lucene/analysis/kuromoji/dict/ test/org/apache/lucene...

Author: rmuir
Date: Mon Jan  9 08:10:37 2012
New Revision: 1229052

URL: http://svn.apache.org/viewvc?rev=1229052&view=rev
Log:
LUCENE-3305: add inflection data

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat   (with props)
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Mon Jan  9 08:10:37 2012
@@ -93,6 +93,20 @@ public class Token {
   }
   
   /**
+   * @return inflection type or null
+   */
+  public String getInflectionType() {
+    return dictionary.getInflectionType(wordId);
+  }
+  
+  /**
+   * @return inflection form or null
+   */
+  public String getInflectionForm() {
+    return dictionary.getInflectionForm(wordId);
+  }
+  
+  /**
    * @return base form or null if token is not inflected
    */
   public String getBaseForm() {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Mon Jan  9 08:10:37 2012
@@ -37,21 +37,27 @@ public abstract class BinaryDictionary i
   public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
   public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
   public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
+  public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
   
   public static final String DICT_HEADER = "kuromoji_dict";
   public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
   public static final String POSDICT_HEADER = "kuromoji_dict_pos";
+  public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
   public static final int VERSION = 1;
   
   private final ByteBuffer buffer;
   private final int[] targetMapOffsets, targetMap;
   private final String[] posDict;
+  private final String[] inflTypeDict;
+  private final String[] inflFormDict;
   
   protected BinaryDictionary() throws IOException {
-    InputStream mapIS = null, dictIS = null, posIS = null;
+    InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
     IOException priorE = null;
     int[] targetMapOffsets = null, targetMap = null;
     String[] posDict = null;
+    String[] inflFormDict = null;
+    String[] inflTypeDict = null;
     ByteBuffer buffer = null;
     try {
       mapIS = getClass().getResourceAsStream(getClass().getSimpleName() + TARGETMAP_FILENAME_SUFFIX);
@@ -86,6 +92,20 @@ public abstract class BinaryDictionary i
       for (int j = 0; j < posDict.length; j++) {
         posDict[j] = in.readString();
       }
+      
+      inflIS = getClass().getResourceAsStream(getClass().getSimpleName() + INFLDICT_FILENAME_SUFFIX);
+      if (inflIS == null)
+        throw new FileNotFoundException("Not in classpath: " + getClass().getName().replace('.','/') + INFLDICT_FILENAME_SUFFIX);
+      inflIS = new BufferedInputStream(inflIS);
+      in = new InputStreamDataInput(inflIS);
+      CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
+      int length = in.readVInt();
+      inflTypeDict = new String[length];
+      inflFormDict = new String[length];
+      for (int j = 0; j < length; j++) {
+        inflTypeDict[j] = in.readString();
+        inflFormDict[j] = in.readString();
+      }
 
       dictIS = getClass().getResourceAsStream(getClass().getSimpleName() + DICT_FILENAME_SUFFIX);
       if (dictIS == null)
@@ -103,12 +123,14 @@ public abstract class BinaryDictionary i
     } catch (IOException ioe) {
       priorE = ioe;
     } finally {
-      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
+      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
     }
     
     this.targetMap = targetMap;
     this.targetMapOffsets = targetMapOffsets;
     this.posDict = posDict;
+    this.inflTypeDict = inflTypeDict;
+    this.inflFormDict = inflFormDict;
     this.buffer = buffer;
   }
   
@@ -134,64 +156,127 @@ public abstract class BinaryDictionary i
     return buffer.getShort(wordId + 4);	// Skip left id and right id
   }
 
-  private String readString(int offset, int length, boolean kana) {
-    char text[] = new char[length];
-    if (kana) {
-      for (int i = 0; i < length; i++) {
-        text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
-      }
+  @Override
+  public String getBaseForm(int wordId) {
+    int offset = baseFormOffset(wordId);
+    int length = (buffer.get(offset++) & 0xff) >>> 1;
+    if (length == 0) {
+      return null; // same as surface form
     } else {
-      for (int i = 0; i < length; i++) {
-        text[i] = buffer.getChar(offset + (i << 1));
-      }
+      return readString(offset, length, false);
     }
-    return new String(text);
   }
   
   @Override
   public String getReading(int wordId) {
-    int offset = wordId + 7;
-    int baseFormLength = buffer.get(offset++) & 0xff;
-    offset += baseFormLength << 1;
+    int offset = readingOffset(wordId);
     int readingData = buffer.get(offset++) & 0xff;
     return readString(offset, readingData >>> 1, (readingData & 1) == 1);
   }
   
   @Override
+  public String getPartOfSpeech(int wordId) {
+    int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
+    return posDict[posIndex >>> 1];
+  }
+  
+  @Override
   public String getPronunciation(int wordId) {
-    int offset = wordId + 7;
-    int baseFormLength = buffer.get(offset++) & 0xff;
-    offset += baseFormLength << 1;
-    int readingData = buffer.get(offset++) & 0xff;
-    int readingLength = readingData >>> 1;
-    int readingOffset = offset;
-    if ((readingData & 1) == 0) {
-      offset += readingLength << 1;
+    if (hasPronunciationData(wordId)) {
+      int offset = pronunciationOffset(wordId);
+      int pronunciationData = buffer.get(offset++) & 0xff;
+      return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
     } else {
-      offset += readingLength;
+      return getReading(wordId); // same as the reading
     }
-    int pronunciationData = buffer.get(offset++) & 0xff;
-    if (pronunciationData == 0) {
-      return readString(readingOffset, readingLength, (readingData & 1) == 1); 
+  }
+  
+  @Override
+  public String getInflectionType(int wordId) {
+    int index = getInflectionIndex(wordId);
+    return index < 0 ? null : inflTypeDict[index];
+  }
+
+  @Override
+  public String getInflectionForm(int wordId) {
+    int index = getInflectionIndex(wordId);
+    return index < 0 ? null : inflFormDict[index];
+  }
+  
+  private static int posOffset(int wordId) {
+    return wordId + 6;
+  }
+  
+  private static int baseFormOffset(int wordId) {
+    return wordId + 7;
+  }
+  
+  private int readingOffset(int wordId) {
+    int offset = baseFormOffset(wordId);
+    int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
+    return offset + baseFormLength;
+  }
+  
+  private int pronunciationOffset(int wordId) {
+    int offset = readingOffset(wordId);
+    int readingData = buffer.get(offset++) & 0xff;
+    final int readingLength;
+    if ((readingData & 1) == 0) {
+      readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
     } else {
-      return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
+      readingLength = readingData >>> 1;
     }
+    return offset + readingLength;
   }
   
-  @Override
-  public String getPartOfSpeech(int wordId) {
-    int posIndex = buffer.get(wordId + 6) & 0xff; // read index into posDict
-    return posDict[posIndex];
+  private boolean hasPronunciationData(int wordId) {
+    int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
+    return (baseFormData & 1) == 0;
   }
   
-  @Override
-  public String getBaseForm(int wordId) {
-    int offset = wordId + 7;
-    int length = buffer.get(offset++) & 0xff;
-    if (length == 0) {
-      return null; // same as surface form
+  private boolean hasInflectionData(int wordId) {
+    int posData = buffer.get(posOffset(wordId)) & 0xff;
+    return (posData & 1) == 1;
+  }
+  
+  private int getInflectionIndex(int wordId) {
+    if (!hasInflectionData(wordId)) {
+      return -1; // common case: no inflection data
+    }
+    
+    // skip past reading/pronunciation at the end
+    int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
+    int endData = buffer.get(offset++) & 0xff;
+    
+    final int endLength;
+    if ((endData & 1) == 0) {
+      endLength = endData & 0xfe; // UTF-16: mask off kana bit
     } else {
-      return readString(offset, length, false);
+      endLength = endData >>> 1;
     }
+    
+    offset += endLength;
+    
+    byte b = buffer.get(offset++);
+    int i = b & 0x7F;
+    if ((b & 0x80) == 0) return i;
+    b = buffer.get(offset++);
+    i |= (b & 0x7F) << 7;
+    assert ((b & 0x80) == 0);
+    return i;
+  }
+  
+  private String readString(int offset, int length, boolean kana) {
+    char text[] = new char[length];
+    if (kana) {
+      for (int i = 0; i < length; i++) {
+        text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
+      }
+    } else {
+      for (int i = 0; i < length; i++) {
+        text[i] = buffer.getChar(offset + (i << 1));
+      }
+    }
+    return new String(text);
   }
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Mon Jan  9 08:10:37 2012
@@ -47,7 +47,6 @@ public interface Dictionary {
    * @param wordId word ID of token
    * @return Part-Of-Speech of the token
    */
-  // TODO: split into the type-safe components
   public String getPartOfSpeech(int wordId);
   
   /**
@@ -71,6 +70,19 @@ public interface Dictionary {
    */
   public String getPronunciation(int wordId);
   
+  /**
+   * Get inflection type of tokens
+   * @param wordId word ID of token
+   * @return inflection type, or null
+   */
+  public String getInflectionType(int wordId);
+  
+  /**
+   * Get inflection form of tokens
+   * @param wordId word ID of token
+   * @return inflection form, or null
+   */
+  public String getInflectionForm(int wordId);
   // TODO: maybe we should have a optimal method, a non-typesafe
   // 'getAdditionalData' if other dictionaries like unidic have additional data
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Mon Jan  9 08:10:37 2012
@@ -55,6 +55,16 @@ public final class UnknownDictionary ext
     return null;
   }
 
+  @Override
+  public String getInflectionType(int wordId) {
+    return null;
+  }
+
+  @Override
+  public String getInflectionForm(int wordId) {
+    return null;
+  }
+
   public static UnknownDictionary getInstance() {
     return SingletonHolder.INSTANCE;
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Mon Jan  9 08:10:37 2012
@@ -178,6 +178,16 @@ public final class UserDictionary implem
     return null; // TODO: add support?
   }
   
+  @Override
+  public String getInflectionType(int wordId) {
+    return null; // TODO: add support?
+  }
+
+  @Override
+  public String getInflectionForm(int wordId) {
+    return null; // TODO: add support?
+  }
+  
   private String[] getAllFeaturesArray(int wordId) {
     String allFeatures = featureEntries.get(wordId);
     if(allFeatures == null) {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24inflDict.dat?rev=1229052&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24inflDict.dat?rev=1229052&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SegmenterTest.java Mon Jan  9 08:10:37 2012
@@ -132,6 +132,36 @@ public class SegmenterTest extends Lucen
   }
   
   @Test
+  public void testInflectionTypes() {
+    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
+    assertEquals(9, tokens.size());
+    assertNull(tokens.get(0).getInflectionType());
+    assertNull(tokens.get(1).getInflectionType());
+    assertNull(tokens.get(2).getInflectionType());
+    assertNull(tokens.get(3).getInflectionType());
+    assertNull(tokens.get(4).getInflectionType());
+    assertNull(tokens.get(5).getInflectionType());
+    assertEquals(tokens.get(6).getInflectionType(), "五段・ラ行");
+    assertEquals(tokens.get(7).getInflectionType(), "特殊・マス");
+    assertNull(tokens.get(8).getInflectionType());
+  }
+  
+  @Test
+  public void testInflectionForms() {
+    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
+    assertEquals(9, tokens.size());
+    assertNull(tokens.get(0).getInflectionForm());
+    assertNull(tokens.get(1).getInflectionForm());
+    assertNull(tokens.get(2).getInflectionForm());
+    assertNull(tokens.get(3).getInflectionForm());
+    assertNull(tokens.get(4).getInflectionForm());
+    assertNull(tokens.get(5).getInflectionForm());
+    assertEquals(tokens.get(6).getInflectionForm(), "連用形");
+    assertEquals(tokens.get(7).getInflectionForm(), "基本形");
+    assertNull(tokens.get(8).getInflectionForm());
+  }
+  
+  @Test
   public void testPartOfSpeech() {
     List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
     assertEquals(9, tokens.size());

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Mon Jan  9 08:10:37 2012
@@ -27,6 +27,7 @@ import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.HashMap;
 import java.util.Map;
@@ -35,11 +36,8 @@ import org.apache.lucene.store.DataOutpu
 import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CodecUtil;
-import org.apache.lucene.util.RamUsageEstimator;
 
-import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
 
 public abstract class BinaryDictionaryWriter {
   protected final Class<? extends BinaryDictionary> implClazz;
@@ -49,6 +47,9 @@ public abstract class BinaryDictionaryWr
   private int[] targetMapOffsets = new int[8192];
   private final List<String> posDict = new ArrayList<String>();
   private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
+  
+  private final List<String> inflDict = new ArrayList<String>();
+  private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
 
   public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
     this.implClazz = implClazz;
@@ -82,8 +83,26 @@ public abstract class BinaryDictionaryWr
       assert posDict.size() == posDictLookup.size();
     }
     
-    // TODO: what are the parts 9 and 10 that kuromoji does not expose via Token?
-    // we need to break all these out (we can structure them inside posdict)
+    sb.setLength(0);
+    sb.append(CSVUtil.quoteEscape(entry[8]));
+    sb.append(',');
+    sb.append(CSVUtil.quoteEscape(entry[9]));
+    String inflData = sb.toString();
+    
+    Integer inflIndex = Integer.MAX_VALUE;
+    int hasInflData;
+    if ("*,*".equals(inflData)) {
+      hasInflData = 0; // no inflection data
+    } else {
+      hasInflData = 1;
+      inflIndex = inflDictLookup.get(inflData);
+      if (inflIndex == null) {
+        inflIndex = inflDict.size();
+        inflDict.add(inflData);
+        inflDictLookup.put(inflData, inflIndex);
+        assert inflDict.size() == inflDictLookup.size();
+      }
+    }
     
     String baseForm = entry[10];
     String reading = entry[11];
@@ -91,8 +110,8 @@ public abstract class BinaryDictionaryWr
     
     // extend buffer if necessary
     int left = buffer.remaining();
-    // worst case: three short, 4 bytes and features (all as utf-16)
-    int worstCase = 6 + 4 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+    // worst case: three short, 4 bytes, one vint and features (all as utf-16)
+    int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
     if (worstCase > left) {
       ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
       buffer.flip();
@@ -103,13 +122,16 @@ public abstract class BinaryDictionaryWr
     buffer.putShort(leftId);
     buffer.putShort(rightId);
     buffer.putShort(wordCost);
-    assert posIndex.intValue() < 256;
-    buffer.put(posIndex.byteValue());
+    assert posIndex.intValue() < 128;
+    buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
+    
+    int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
     
     if (baseForm.equals(entry[0])) {
-      buffer.put((byte)0); // base form is the same as surface form
+      buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
     } else {
-      buffer.put((byte)baseForm.length());
+      assert baseForm.length() < 128;
+      buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
       for (int i = 0; i < baseForm.length(); i++) {
         buffer.putChar(baseForm.charAt(i));
       }
@@ -125,9 +147,7 @@ public abstract class BinaryDictionaryWr
       }
     }
     
-    if (pronunciation.equals(reading)) {
-      buffer.put((byte)0); // pronunciation is the same as reading
-    } else {
+    if (pronunciationIsReading == 0) {
       if (isKatakana(pronunciation)) {
         buffer.put((byte) (pronunciation.length() << 1 | 1));
         writeKatakana(pronunciation);
@@ -139,6 +159,17 @@ public abstract class BinaryDictionaryWr
       }
     }
     
+    if (hasInflData > 0) {
+      int key = inflIndex.intValue();
+      assert key < 32768; // note there are really like 300 of these...
+      if (key < 128) {
+        buffer.put((byte) key);
+      } else {
+        buffer.put((byte) ((key & 0x7f) | 0x80));
+        buffer.put((byte) (key >>> 7));
+      }
+    }
+    
     return buffer.position();
   }
   
@@ -194,6 +225,7 @@ public abstract class BinaryDictionaryWr
     writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
     writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
     writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
+    writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
   }
   
   // TODO: maybe this int[] should instead be the output to the FST...
@@ -242,6 +274,25 @@ public abstract class BinaryDictionaryWr
     }
   }
   
+  protected void writeInflDict(String filename) throws IOException {
+    new File(filename).getParentFile().mkdirs();
+    OutputStream os = new FileOutputStream(filename);
+    try {
+      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
+      out.writeVInt(inflDict.size());
+      for (String s : inflDict) {
+        String data[] = CSVUtil.parse(s);
+        assert data.length == 2 : "malformed inflection: " + s;
+        out.writeString(data[0]);
+        out.writeString(data[1]);
+      }
+    } finally {
+      os.close();
+    }
+  }
+  
   protected void writeDictionary(String filename) throws IOException {
     new File(filename).getParentFile().mkdirs();
     final FileOutputStream os = new FileOutputStream(filename);
@@ -259,4 +310,58 @@ public abstract class BinaryDictionaryWr
     }
   }
   
+  // TODO: the below is messy, but makes the dictionary smaller.
+  // we track frequencies of inflections so the highest-freq ones have smaller indexes.
+
+  /** optional: notes inflection seen in the data up front */
+  public void noteInflection(String entry[]) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(CSVUtil.quoteEscape(entry[8]));
+    sb.append(',');
+    sb.append(CSVUtil.quoteEscape(entry[9]));
+    String s = sb.toString();
+    if ("*,*".equals(s)) {
+      return; // no inflection data
+    }
+    Integer freq = notedInflections.get(s);
+    if (freq == null) {
+      freq = 0;
+    }
+    notedInflections.put(s, freq+1);
+  }
+  
+  /** prepopulates inflection mapping by frequency */
+  public void finalizeInflections() {
+    InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
+    int upto = 0;
+    for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
+      freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
+    }
+    Arrays.sort(freqs, Collections.reverseOrder());
+    for (int i = 0; i < upto; i++) {
+      inflDict.add(freqs[i].inflection);
+      inflDictLookup.put(freqs[i].inflection, i);
+    }
+  }
+  
+  static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
+    String inflection;
+    int freq;
+    
+    InflectionAndFreq(String s, int i) {
+      this.inflection = s;
+      this.freq = i;
+    }
+    
+    public int compareTo(InflectionAndFreq other) {
+      int cmp = freq - other.freq;
+      if (cmp == 0) {
+        return inflection.compareTo(other.inflection);
+      } else {
+        return cmp;
+      }
+    }
+  }
+  
+  private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/ConnectionCostsWriter.java Mon Jan  9 08:10:37 2012
@@ -63,7 +63,6 @@ public final class ConnectionCostsWriter
       for (short[] a : costs) {
         assert a.length == forwardSize;
         for (int i = 0; i < a.length; i++) {
-          // TODO: when delta is 0, maybe we should RLE
           int delta = (int)a[i] - last;
           out.writeVInt((delta >> 31) ^ (delta << 1));
           last = a[i];

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1229052&r1=1229051&r2=1229052&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Mon Jan  9 08:10:37 2012
@@ -94,11 +94,15 @@ public class TokenInfoDictionaryBuilder 
       String line = null;
       while ((line = reader.readLine()) != null) {
         String[] entry = CSVUtil.parse(line);
+
         if(entry.length < 13) {
           System.out.println("Entry in CSV is not valid: " + line);
           continue;
         }
-        lines.add(formatEntry(entry));
+        
+        String[] formatted = formatEntry(entry);
+        dictionary.noteInflection(formatted);
+        lines.add(formatted);
         
         // NFKC normalize dictionary entry
         if (normalizeEntries) {
@@ -109,15 +113,19 @@ public class TokenInfoDictionaryBuilder 
           for (int i = 0; i < entry.length; i++) {
             normalizedEntry[i] = normalizer.normalize(entry[i]);
           }
-            
-          lines.add(formatEntry(normalizedEntry));
+          
+          formatted = formatEntry(normalizedEntry);
+          dictionary.noteInflection(formatted);
+          lines.add(formatted);
         }
       }
     }
     
+    dictionary.finalizeInflections();
+    
     System.out.println("  sort...");
 
-    // sort by term
+    // sort by term, then cost, then all other features
     Collections.sort(lines, new Comparator<String[]>() {
       public int compare(String[] left, String[] right) {
         return left[0].compareTo(right[0]);