You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/06 05:17:54 UTC

svn commit: r1227952 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/dict/ resources/org/apache/lucene/analysis/kuromoji/dict/ test/org/apache/lucene/analysis/kuromoji/ tools/java/org/apache/...

Author: rmuir
Date: Fri Jan  6 04:17:52 2012
New Revision: 1227952

URL: http://svn.apache.org/viewvc?rev=1227952&view=rev
Log:
LUCENE-3305: deduplicate POS, reduces tokeninfodict 14MB

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat   (with props)
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java Fri Jan  6 04:17:52 2012
@@ -37,18 +37,22 @@ public abstract class BinaryDictionary i
   
   public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
   public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
+  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
   
-  public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
   public static final String DICT_HEADER = "kuromoji_dict";
+  public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
+  public static final String POSDICT_HEADER = "kuromoji_dict_pos";
   public static final int VERSION = 1;
   
   private final ByteBuffer buffer;
   private final int[][] targetMap;
+  private final String[] posDict;
   
   protected BinaryDictionary() throws IOException {
-    InputStream mapIS = null, dictIS = null;
+    InputStream mapIS = null, dictIS = null, posIS = null;
     IOException priorE = null;
     int[][] targetMap = null;
+    String[] posDict = null;
     ByteBuffer buffer = null;
     try {
       mapIS = getClass().getResourceAsStream(getClass().getSimpleName() + TARGETMAP_FILENAME_SUFFIX);
@@ -72,6 +76,17 @@ public abstract class BinaryDictionary i
           j++;
         }
       }
+      
+      posIS = getClass().getResourceAsStream(getClass().getSimpleName() + POSDICT_FILENAME_SUFFIX);
+      if (posIS == null)
+        throw new FileNotFoundException("Not in classpath: " + getClass().getName().replace('.','/') + POSDICT_FILENAME_SUFFIX);
+      posIS = new BufferedInputStream(posIS);
+      in = new InputStreamDataInput(posIS);
+      CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
+      posDict = new String[in.readVInt()];
+      for (int j = 0; j < posDict.length; j++) {
+        posDict[j] = in.readString();
+      }
 
       dictIS = getClass().getResourceAsStream(getClass().getSimpleName() + DICT_FILENAME_SUFFIX);
       if (dictIS == null)
@@ -89,10 +104,11 @@ public abstract class BinaryDictionary i
     } catch (IOException ioe) {
       priorE = ioe;
     } finally {
-      IOUtils.closeWhileHandlingException(priorE, mapIS, dictIS);
+      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
     }
     
     this.targetMap = targetMap;
+    this.posDict = posDict;
     this.buffer = buffer;
   }
   
@@ -115,13 +131,22 @@ public abstract class BinaryDictionary i
     return buffer.getShort(wordId + 4);	// Skip left id and right id
   }
   
+  // TODO: this method will likely never be efficient, do we need it?
   @Override
   public String[] getAllFeaturesArray(int wordId) {
-    int size = buffer.getShort(wordId + 6) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
-    char[] targetArr = new char[size];
-    int offset = wordId + 6 + 2; // offset is position where features string starts
+    char posIndex = buffer.getChar(wordId + 6); // read index into posDict
+    String pos = posDict[posIndex];
+    int posLen = pos.length();
+    int size = buffer.getShort(wordId + 8) / 2; // Read length of feature String. Skip 8 bytes, see data structure.
+    char[] targetArr = new char[posLen + 1 + size]; // pos + separator + the rest
+    int offset = wordId + 8 + 2; // offset is position where features string starts
+    for (int i = 0; i < pos.length(); i++) {
+      targetArr[i] = pos.charAt(i);
+    }
+    int upto = posLen;
+    targetArr[upto++] = INTERNAL_SEPARATOR.charAt(0);
     for(int i = 0; i < size; i++){
-      targetArr[i] = buffer.getChar(offset + i * 2);
+      targetArr[upto++] = buffer.getChar(offset + i * 2);
     }
     String allFeatures = new String(targetArr);
     return allFeatures.split(INTERNAL_SEPARATOR);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1227952&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1227952&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Fri Jan  6 04:17:52 2012
@@ -91,6 +91,21 @@ public class TokenizerTest extends Lucen
     assertNull(tokens.get(8).getBaseForm());
   }
   
+  @Test
+  public void testPartOfSpeech() {
+    List<Token> tokens = tokenizer.tokenize("それはまだ実験段階にあります。");
+    assertEquals(9, tokens.size());
+    assertEquals("名詞,代名詞,一般,*",  tokens.get(0).getPartOfSpeech());
+    assertEquals("助詞,係助詞,*,*",    tokens.get(1).getPartOfSpeech());
+    assertEquals("副詞,助詞類接続,*,*", tokens.get(2).getPartOfSpeech());
+    assertEquals("名詞,サ変接続,*,*",   tokens.get(3).getPartOfSpeech());
+    assertEquals("名詞,一般,*,*",      tokens.get(4).getPartOfSpeech());
+    assertEquals("助詞,格助詞,一般,*",  tokens.get(5).getPartOfSpeech());
+    assertEquals("動詞,自立,*,*",      tokens.get(6).getPartOfSpeech());
+    assertEquals("助動詞,*,*,*",       tokens.get(7).getPartOfSpeech());
+    assertEquals("記号,句点,*,*",      tokens.get(8).getPartOfSpeech());
+  }
+  
   public void testBocchan() throws Exception {
     doTestBocchan(1);
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java Fri Jan  6 04:17:52 2012
@@ -25,6 +25,8 @@ import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
+import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
@@ -38,7 +40,8 @@ import org.apache.lucene.analysis.kuromo
 public class TokenInfoDictionaryWriter {
   protected ByteBuffer buffer;
   protected int[][] targetMap = new int[1][];
-  
+  protected List<String> posDict = new ArrayList<String>();
+
   public TokenInfoDictionaryWriter(int size) {
     targetMap = new int[1][];
     buffer = ByteBuffer.allocate(size);
@@ -54,7 +57,23 @@ public class TokenInfoDictionaryWriter {
     short wordCost = Short.parseShort(entry[3]);
     
     StringBuilder sb = new StringBuilder();
-    for (int i = 4; i < entry.length; i++){
+    
+    // build up the POS string
+    for (int i = 4; i < 10; i++) {
+      sb.append(entry[i]);
+      if (i < 9) {
+        sb.append(Dictionary.INTERNAL_SEPARATOR);
+      }
+    }
+    String pos = sb.toString();
+    int posIndex = posDict.indexOf(pos);
+    if (posIndex < 0) {
+      posIndex = posDict.size();
+      posDict.add(pos);
+    }
+    
+    sb.setLength(0);
+    for (int i = 10; i < entry.length; i++){
       sb.append(entry[i]).append(Dictionary.INTERNAL_SEPARATOR);
     }
     String features = sb.deleteCharAt(sb.length() - 1).toString();
@@ -62,7 +81,7 @@ public class TokenInfoDictionaryWriter {
     
     // extend buffer if necessary
     int left = buffer.remaining();
-    if (8 + featuresSize > left) { // four short and features
+    if (10 + featuresSize > left) { // four shorts, one char (POS index), and features
       ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + 1, 1));
       buffer.flip();
       newBuffer.put(buffer);
@@ -72,6 +91,8 @@ public class TokenInfoDictionaryWriter {
     buffer.putShort(leftId);
     buffer.putShort(rightId);
     buffer.putShort(wordCost);
+    assert posIndex < Character.MAX_VALUE;
+    buffer.putChar((char)posIndex);
     buffer.putShort((short)featuresSize);
     for (char c : features.toCharArray()){
       buffer.putChar(c);
@@ -111,6 +132,7 @@ public class TokenInfoDictionaryWriter {
   public void write(String baseDir) throws IOException {
     writeDictionary(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.DICT_FILENAME_SUFFIX);
     writeTargetMap(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+    writePosDict(baseDir + File.separator + TokenInfoDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
   }
   
   protected void writeTargetMap(String filename) throws IOException {
@@ -150,6 +172,22 @@ public class TokenInfoDictionaryWriter {
     }
   }
   
+  protected void writePosDict(String filename) throws IOException {
+    new File(filename).getParentFile().mkdirs();
+    OutputStream os = new FileOutputStream(filename);
+    try {
+      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
+      out.writeVInt(posDict.size());
+      for (String s : posDict) {
+        out.writeString(s);
+      }
+    } finally {
+      os.close();
+    }
+  }
+  
   protected void writeDictionary(String filename) throws IOException {
     new File(filename).getParentFile().mkdirs();
     final FileOutputStream os = new FileOutputStream(filename);

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java?rev=1227952&r1=1227951&r2=1227952&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java Fri Jan  6 04:17:52 2012
@@ -51,6 +51,7 @@ public class UnknownDictionaryWriter ext
   public void write(String baseDir) throws IOException {
     writeDictionary(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.DICT_FILENAME_SUFFIX);
     writeTargetMap(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
+    writePosDict(baseDir + File.separator + UnknownDictionary.class.getName().replace('.', File.separatorChar) + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
     characterDefinition.write(baseDir);
   }
 }