You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/17 03:12:28 UTC

svn commit: r1232265 - in /lucene/dev/trunk/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ resources/org/apache/lucene/analysis/kuromoji/dict/ test/org/apache/lucene/analysis/kuromoji/dict/ tools/java/org/apache/lucene/analysis/kuromoji/util/

Author: rmuir
Date: Tue Jan 17 02:12:27 2012
New Revision: 1232265

URL: http://svn.apache.org/viewvc?rev=1232265&view=rev
Log:
LUCENE-3699: share baseform with surface and flag if the reading can be computed from surface

Modified:
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
    lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
    lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
    lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Tue Jan 17 02:12:27 2012
@@ -75,14 +75,14 @@ public class Token {
    * @return reading. null if token doesn't have reading.
    */
   public String getReading() {
-    return dictionary.getReading(wordId);
+    return dictionary.getReading(wordId, surfaceForm, offset, length);
   }
   
   /**
    * @return pronunciation. null if token doesn't have pronunciation.
    */
   public String getPronunciation() {
-    return dictionary.getPronunciation(wordId);
+    return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
   }
   
   /**
@@ -110,7 +110,7 @@ public class Token {
    * @return base form or null if token is not inflected
    */
   public String getBaseForm() {
-    return dictionary.getBaseForm(wordId);
+    return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
   }
   
   /**

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Tue Jan 17 02:12:27 2012
@@ -149,12 +149,12 @@ public abstract class BinaryDictionary i
   
   @Override	
   public int getLeftId(int wordId) {
-    return buffer.getShort(wordId) >>> 2;
+    return buffer.getShort(wordId) >>> 3;
   }
   
   @Override
   public int getRightId(int wordId) {
-    return buffer.getShort(wordId) >>> 2;
+    return buffer.getShort(wordId) >>> 3;
   }
   
   @Override
@@ -163,21 +163,42 @@ public abstract class BinaryDictionary i
   }
 
   @Override
-  public String getBaseForm(int wordId) {
+  public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
     if (hasBaseFormData(wordId)) {
       int offset = baseFormOffset(wordId);
-      int length = buffer.get(offset++) & 0xff;
-      return readString(offset, length, false);
+      int data = buffer.get(offset++) & 0xff;
+      int prefix = data >>> 4;
+      int suffix = data & 0xF;
+      char text[] = new char[prefix+suffix];
+      System.arraycopy(surfaceForm, off, text, 0, prefix);
+      for (int i = 0; i < suffix; i++) {
+        text[prefix+i] = buffer.getChar(offset + (i << 1));
+      }
+      return new String(text);
     } else {
       return null;
     }
   }
   
   @Override
-  public String getReading(int wordId) {
-    int offset = readingOffset(wordId);
-    int readingData = buffer.get(offset++) & 0xff;
-    return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+  public String getReading(int wordId, char surface[], int off, int len) {
+    if (hasReadingData(wordId)) {
+      int offset = readingOffset(wordId);
+      int readingData = buffer.get(offset++) & 0xff;
+      return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+    } else {
+      // the reading is the surface form, with hiragana shifted to katakana
+      char text[] = new char[len];
+      for (int i = 0; i < len; i++) {
+        char ch = surface[off+i];
+        if (ch > 0x3040 && ch < 0x3097) {
+          text[i] = (char)(ch + 0x60);
+        } else {
+          text[i] = ch;
+        }
+      }
+      return new String(text);
+    }
   }
   
   @Override
@@ -186,13 +207,13 @@ public abstract class BinaryDictionary i
   }
   
   @Override
-  public String getPronunciation(int wordId) {
+  public String getPronunciation(int wordId, char surface[], int off, int len) {
     if (hasPronunciationData(wordId)) {
       int offset = pronunciationOffset(wordId);
       int pronunciationData = buffer.get(offset++) & 0xff;
       return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
     } else {
-      return getReading(wordId); // same as the reading
+      return getReading(wordId, surface, off, len); // same as the reading
     }
   }
   
@@ -213,7 +234,7 @@ public abstract class BinaryDictionary i
   private int readingOffset(int wordId) {
     int offset = baseFormOffset(wordId);
     if (hasBaseFormData(wordId)) {
-      int baseFormLength = buffer.get(offset++) & 0xff;
+      int baseFormLength = buffer.get(offset++) & 0xf;
       return offset + (baseFormLength << 1);
     } else {
       return offset;
@@ -221,21 +242,29 @@ public abstract class BinaryDictionary i
   }
   
   private int pronunciationOffset(int wordId) {
-    int offset = readingOffset(wordId);
-    int readingData = buffer.get(offset++) & 0xff;
-    final int readingLength;
-    if ((readingData & 1) == 0) {
-      readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+    if (hasReadingData(wordId)) {
+      int offset = readingOffset(wordId);
+      int readingData = buffer.get(offset++) & 0xff;
+      final int readingLength;
+      if ((readingData & 1) == 0) {
+        readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+      } else {
+        readingLength = readingData >>> 1;
+      }
+      return offset + readingLength;
     } else {
-      readingLength = readingData >>> 1;
+      return readingOffset(wordId);
     }
-    return offset + readingLength;
   }
   
   private boolean hasBaseFormData(int wordId) {
     return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
   }
   
+  private boolean hasReadingData(int wordId) {
+    return (buffer.getShort(wordId) & HAS_READING) != 0;
+  }
+  
   private boolean hasPronunciationData(int wordId) {
     return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
   }
@@ -256,6 +285,8 @@ public abstract class BinaryDictionary i
   
   /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
   public static final int HAS_BASEFORM = 1;
+  /** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
+  public static final int HAS_READING = 2;
   /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
-  public static final int HAS_PRONUNCIATION = 2;
+  public static final int HAS_PRONUNCIATION = 4;
 }

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Tue Jan 17 02:12:27 2012
@@ -54,21 +54,21 @@ public interface Dictionary {
    * @param wordId word ID of token
    * @return Reading of the token
    */
-  public String getReading(int wordId);
+  public String getReading(int wordId, char surface[], int off, int len);
   
   /**
    * Get base form of word
    * @param wordId word ID of token
    * @return Base form (only different for inflected words, otherwise null)
    */
-  public String getBaseForm(int wordId);
+  public String getBaseForm(int wordId, char surface[], int off, int len);
   
   /**
    * Get pronunciation of tokens
    * @param wordId word ID of token
    * @return Pronunciation of the token
    */
-  public String getPronunciation(int wordId);
+  public String getPronunciation(int wordId, char surface[], int off, int len);
   
   /**
    * Get inflection type of tokens

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Tue Jan 17 02:12:27 2012
@@ -51,7 +51,7 @@ public final class UnknownDictionary ext
   }
   
   @Override
-  public String getReading(int wordId) {
+  public String getReading(int wordId, char surface[], int off, int len) {
     return null;
   }
 

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Tue Jan 17 02:12:27 2012
@@ -196,7 +196,7 @@ public final class UserDictionary implem
   }
   
   @Override
-  public String getReading(int wordId) {
+  public String getReading(int wordId, char surface[], int off, int len) {
     return getFeature(wordId, 0);
   }
   
@@ -206,12 +206,12 @@ public final class UserDictionary implem
   }
   
   @Override
-  public String getBaseForm(int wordId) {
+  public String getBaseForm(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }
   
   @Override
-  public String getPronunciation(int wordId) {
+  public String getPronunciation(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }
   

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java Tue Jan 17 02:12:27 2012
@@ -61,7 +61,7 @@ public class TestTokenInfoDictionary ext
         assertTrue(wordId > lastWordId);
         lastWordId = wordId;
          
-        String baseForm = tid.getBaseForm(wordId);
+        String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
         assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
         
         String inflectionForm = tid.getInflectionForm(wordId);
@@ -91,11 +91,11 @@ public class TestTokenInfoDictionary ext
         // check that its actually an ipadic pos tag
         assertNotNull(ToStringUtil.getPOSTranslation(pos));
         
-        String pronunciation = tid.getPronunciation(wordId);
+        String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
         assertNotNull(pronunciation);
         assertTrue(UnicodeUtil.validUTF16String(pronunciation));
         
-        String reading = tid.getReading(wordId);
+        String reading = tid.getReading(wordId, chars, 0, chars.length);
         assertNotNull(reading);
         assertTrue(UnicodeUtil.validUTF16String(reading));
       }

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java Tue Jan 17 02:12:27 2012
@@ -73,12 +73,12 @@ public class UserDictionaryTest extends 
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
-    assertEquals("ニホン", dictionary.getReading(wordIdNihon));
+    assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
     
     result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
     assertEquals(1, result.length);
     int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
-    assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
+    assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
   }
   
   @Test

Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java?rev=1232265&r1=1232264&r2=1232265&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java Tue Jan 17 02:12:27 2012
@@ -103,12 +103,15 @@ public abstract class BinaryDictionaryWr
     if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
       flags |= BinaryDictionary.HAS_BASEFORM;
     }
+    if (!reading.equals(toKatakana(entry[0]))) {
+      flags |= BinaryDictionary.HAS_READING;
+    }
     if (!pronunciation.equals(reading)) {
       flags |= BinaryDictionary.HAS_PRONUNCIATION;
     }
 
     assert leftId == rightId;
-    assert leftId < 8192; // there are still unused bits
+    assert leftId < 4096; // there are still unused bits
     // add pos mapping
     int toFill = 1+leftId - posDict.size();
     for (int i = 0; i < toFill; i++) {
@@ -119,27 +122,36 @@ public abstract class BinaryDictionaryWr
     assert existing == null || existing.equals(fullPOSData);
     posDict.set(leftId, fullPOSData);
     
-    buffer.putShort((short)(leftId << 2 | flags));
+    buffer.putShort((short)(leftId << 3 | flags));
     buffer.putShort(wordCost);
 
     if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
-      buffer.put((byte) baseForm.length());
-      for (int i = 0; i < baseForm.length(); i++) {
+      assert baseForm.length() < 16;
+      int shared = sharedPrefix(entry[0], baseForm);
+      int suffix = baseForm.length() - shared;
+      buffer.put((byte) (shared << 4 | suffix));
+      for (int i = shared; i < baseForm.length(); i++) {
         buffer.putChar(baseForm.charAt(i));
       }
     }
     
-    if (isKatakana(reading)) {
-      buffer.put((byte) (reading.length() << 1 | 1));
-      writeKatakana(reading);
-    } else {
-      buffer.put((byte) (reading.length() << 1));
-      for (int i = 0; i < reading.length(); i++) {
-        buffer.putChar(reading.charAt(i));
+    if ((flags & BinaryDictionary.HAS_READING) != 0) {
+      if (isKatakana(reading)) {
+        buffer.put((byte) (reading.length() << 1 | 1));
+        writeKatakana(reading);
+      } else {
+        buffer.put((byte) (reading.length() << 1));
+        for (int i = 0; i < reading.length(); i++) {
+          buffer.putChar(reading.charAt(i));
+        }
       }
     }
     
     if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
+      // we can save 150KB here, but it makes the reader a little complicated.
+      // int shared = sharedPrefix(reading, pronunciation);
+      // buffer.put((byte) shared);
+      // pronunciation = pronunciation.substring(shared);
       if (isKatakana(pronunciation)) {
         buffer.put((byte) (pronunciation.length() << 1 | 1));
         writeKatakana(pronunciation);
@@ -170,6 +182,27 @@ public abstract class BinaryDictionaryWr
     }
   }
   
+  private String toKatakana(String s) {
+    char text[] = new char[s.length()];
+    for (int i = 0; i < s.length(); i++) {
+      char ch = s.charAt(i);
+      if (ch > 0x3040 && ch < 0x3097) {
+        text[i] = (char)(ch + 0x60);
+      } else {
+        text[i] = ch;
+      }
+    }
+    return new String(text);
+  }
+  
+  public static int sharedPrefix(String left, String right) {
+    int len = left.length() < right.length() ? left.length() : right.length();
+    for (int i = 0; i < len; i++)
+      if (left.charAt(i) != right.charAt(i))
+        return i;
+    return len;
+  }
+  
   public void addMapping(int sourceId, int wordId) {
     assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;