You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/05 15:58:51 UTC
svn commit: r1227618 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji: ./ src/java/org/apache/lucene/analysis/kuromoji/dict/ src/test/org/apache/lucene/analysis/kuromoji/dict/ src/tools/java/org/apache/lucene/analysis/kuromoji/util/ src/t...

Author: rmuir
Date: Thu Jan  5 14:58:50 2012
New Revision: 1227618

URL: http://svn.apache.org/viewvc?rev=1227618&view=rev
Log:
LUCENE-3305: split dictionary reading/writing, nuke files before build-dict to ensure nothing is stale, ensure tools/ breakage breaks the build since its important here

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java   (with props)
Removed:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/build.xml Thu Jan  5 14:58:50 2012
@@ -54,34 +54,68 @@
     <fileset dir="../icu/lib" includes="icu4j-*.jar"/>
   </path>
 
+  <path id="tools.classpath">
+    <path refid="classpath"/>
+    <path refid="tools.dependencies"/>
+    <pathelement location="${build.dir}/classes/java"/>
+    <pathelement location="${build.dir}/classes/tools"/>
+  </path>
+
+  <path id="tools.test.classpath">
+    <path refid="tools.classpath"/>
+    <path refid="test.base.classpath"/>
+    <pathelement location="${build.dir}/classes/tools-test"/>
+  </path>
+
   <target name="build-dict" depends="compile-tools, download-dict">
-    <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
-      <classpath>
-        <path refid="classpath"/>
-        <pathelement path="${build.dir}/classes/java"/>
-        <pathelement path="${build.dir}/classes/tools"/>
-        <path refid="tools.dependencies"/>
-      </classpath>
-      <assertions>
-        <enable package="org.apache.lucene"/>
-      </assertions>
-      <arg value="${dict.format}"/>
-      <arg value="${dict.src.dir}"/>
-      <arg value="${dict.target.dir}"/>
-      <arg value="${dict.encoding}"/>
-      <arg value="${dict.normalize}"/>
-    </java>
+    <sequential>
+      <delete verbose="true">
+        <fileset dir="src/resources/org/apache/lucene/analysis/kuromoji" includes="**/*"/>
+      </delete>
+      <java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
+        <classpath>
+          <path refid="tools.classpath"/>
+          <pathelement path="${build.dir}/classes/tools"/>
+        </classpath>
+        <assertions>
+          <enable package="org.apache.lucene"/>
+        </assertions>
+        <arg value="${dict.format}"/>
+        <arg value="${dict.src.dir}"/>
+        <arg value="${dict.target.dir}"/>
+        <arg value="${dict.encoding}"/>
+        <arg value="${dict.normalize}"/>
+      </java>
+    </sequential>
   </target>
 
-    <target name="compile-tools" depends="compile-core, common.compile-tools">
+  <target name="compile-tools" depends="compile-core, common.compile-tools">
     <compile
       srcdir="src/tools/java"
       destdir="${build.dir}/classes/tools">
       <classpath>
-        <path refid="classpath"/>
-        <pathelement path="${build.dir}/classes/java"/>
-        <path refid="tools.dependencies"/>
+        <path refid="tools.classpath"/>
+        <pathelement path="src/tools/java"/>
       </classpath>
     </compile>
   </target>
+
+  <target name="compile-tools-tests" depends="compile-tools">
+    <compile
+      srcdir="src/tools/test"
+      destdir="${build.dir}/classes/tools-test">
+      <classpath>
+        <path refid="tools.test.classpath"/>
+        <pathelement path="src/tools/test"/>
+      </classpath>
+     </compile>
+  </target>
+
+  <target name="test-tools" depends="compile-tools-tests">
+    <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+  </target>
+
+  <target name="compile-test" depends="contrib-build.compile-test, compile-tools-tests"/>
+  <target name="test" depends="contrib-build.test, test-tools"/>
+
 </project>

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Thu Jan  5 14:58:50 2012
@@ -18,24 +18,15 @@ package org.apache.lucene.analysis.kurom
  */
 
 import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
 import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
-import java.nio.channels.WritableByteChannel;
 
 import org.apache.lucene.store.DataInput;
-import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.util.CodecUtil;
 
 import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
@@ -53,72 +44,6 @@ public class TokenInfoDictionary impleme
   
   protected int[][] targetMap;
   
-  public TokenInfoDictionary() {
-  }
-  
-  public TokenInfoDictionary(int size) {
-    targetMap = new int[1][];
-    buffer = ByteBuffer.allocate(size);
-  }
-  
-  /**
-   * put the entry in map
-   * @return current position of buffer, which will be wordId of next entry
-   */
-  public int put(String[] entry) {
-    short leftId = Short.parseShort(entry[1]);
-    short rightId = Short.parseShort(entry[2]);
-    short wordCost = Short.parseShort(entry[3]);
-    
-    StringBuilder sb = new StringBuilder();
-    for (int i = 4; i < entry.length; i++){
-      sb.append(entry[i]).append(INTERNAL_SEPARATOR);
-    }
-    String features = sb.deleteCharAt(sb.length() - 1).toString();
-    int featuresSize = features.length()* 2;
-    
-    // extend buffer if necessary
-    int left = buffer.limit() - buffer.position();
-    if (8 + featuresSize > left) { // four short and features
-      ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
-      buffer.flip();
-      newBuffer.put(buffer);
-      buffer = newBuffer;
-    }
-    
-    buffer.putShort(leftId);
-    buffer.putShort(rightId);
-    buffer.putShort(wordCost);
-    buffer.putShort((short)featuresSize);
-    for (char c : features.toCharArray()){
-      buffer.putChar(c);
-    }
-    
-    return buffer.position();
-  }
-  
-  public void addMapping(int sourceId, int wordId) {
-    if(targetMap.length <= sourceId) {
-      int[][] newArray = new int[sourceId + 1][];
-      System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
-      targetMap = newArray;
-    }
-    
-    // Prepare array -- extend the length of array by one
-    int[] current = targetMap[sourceId];
-    if (current == null) {
-      current = new int[1];
-    } else {
-      int[] newArray = new int[current.length + 1];
-      System.arraycopy(current, 0, newArray, 0, current.length);
-      current = newArray;
-    }
-    targetMap[sourceId] = current;
-    
-    int[] targets = targetMap[sourceId];
-    targets[targets.length - 1] = wordId;
-  }
-  
   public int[] lookupWordIds(int sourceId) {
     return targetMap[sourceId];
   }
@@ -190,69 +115,6 @@ public class TokenInfoDictionary impleme
     String form = getFeature(wordId, 6);
     return "*".equals(form) ? null : form;
   }
-
-  /**
-   * Write dictionary in file
-   * Dictionary format is:
-   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
-   * @throws IOException
-   */
-  public void write(String directoryname) throws IOException {
-    writeDictionary(directoryname + File.separator + FILENAME);
-    writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
-  }
-  
-  protected void writeTargetMap(String filename) throws IOException {
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
-      CodecUtil.writeHeader(out, TARGETMAP_HEADER, VERSION);
-      out.writeVInt(targetMap.length);
-      int nulls = 0;
-      for (int[] a : targetMap) {
-        if (a == null) {
-          // run-length encoding for all nulls:
-          if (nulls == 0) {
-            out.writeVInt(0);
-          }
-          nulls++;
-        } else {
-          if (nulls > 0) {
-            out.writeVInt(nulls);
-            nulls = 0;
-          }
-          assert a.length > 0;
-          out.writeVInt(a.length);
-          for (int i = 0; i < a.length; i++) {
-            out.writeVInt(a[i]);
-          }
-        }
-      }
-      // write the pending RLE count:
-      if (nulls > 0) {
-        out.writeVInt(nulls);
-      }
-    } finally {
-      os.close();
-    }
-  }
-  
-  protected void writeDictionary(String filename) throws IOException {
-    final FileOutputStream os = new FileOutputStream(filename);
-    try {
-      final DataOutput out = new OutputStreamDataOutput(os);
-      CodecUtil.writeHeader(out, DICT_HEADER, VERSION);
-      out.writeVInt(buffer.position());
-      final WritableByteChannel channel = Channels.newChannel(os);
-      // Write Buffer
-      buffer.flip();  // set position to 0, set limit to current position
-      channel.write(buffer);
-      assert buffer.remaining() == 0L;
-    } finally {
-      os.close();
-    }
-  }
   
   /**
    * Read dictionary into directly allocated buffer.
@@ -308,5 +170,4 @@ public class TokenInfoDictionary impleme
       is.close();
     }
   }
-  
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Thu Jan  5 14:58:50 2012
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
-import java.io.File;
 import java.io.IOException;
 
 public class UnknownDictionary extends TokenInfoDictionary {
@@ -28,31 +27,6 @@ public class UnknownDictionary extends T
   
   private CharacterDefinition characterDefinition;
   
-  /**
-   * Constructor
-   */
-  public UnknownDictionary() {
-  }
-  
-  public UnknownDictionary(int size) {
-    super(size);
-    characterDefinition = new CharacterDefinition();    	
-  }
-  
-  @Override
-  public int put(String[] entry) {
-    // Get wordId of current entry
-    int wordId = buffer.position();
-    
-    // Put entry
-    int result = super.put(entry);
-    
-    // Put entry in targetMap
-    int characterId = CharacterDefinition.lookupCharacterClass(entry[0]);
-    addMapping(characterId, wordId);
-    return result;
-  }
-  
   public int lookup(String text) {
     if(!characterDefinition.isGroup(text.charAt(0))) {
       return 1;
@@ -72,37 +46,10 @@ public class UnknownDictionary extends T
     return length;
   }
   
-  /**
-   * Put mapping from unicode code point to character class.
-   * 
-   * @param codePoint code point
-   * @param characterClassName character class name
-   */
-  public void putCharacterCategory(int codePoint, String characterClassName) {
-    characterDefinition.putCharacterCategory(codePoint, characterClassName);
-  }
-  
-  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
-    characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
-  }
-  
-  
   public CharacterDefinition getCharacterDefinition() {
     return characterDefinition;
   }
   
-  /**
-   * Write dictionary in file
-   * Dictionary format is:
-   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
-   * @throws IOException
-   */
-  public void write(String directoryname) throws IOException {
-    writeDictionary(directoryname + File.separator + FILENAME);
-    writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
-    characterDefinition.write(directoryname);
-  }
-  
   public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
     UnknownDictionary dictionary = new UnknownDictionary();
     dictionary.characterDefinition = CharacterDefinition.getInstance();

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/DictionaryBuilder.java Thu Jan  5 14:58:50 2012
@@ -41,7 +41,7 @@ public class DictionaryBuilder {
       boolean normalizeEntry) throws IOException {
     System.out.println("building tokeninfo dict...");
     TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
-    TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
+    TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
     
     System.out.print("  building double array trie...");
     DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet());
@@ -65,7 +65,7 @@ public class DictionaryBuilder {
     
     System.out.print("building unknown word dict...");
     UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
-    UnknownDictionary unkDictionary = unkBuilder.build(inputDirname);
+    UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
     unkDictionary.write(outputDirname+File.separatorChar+UnknownDictionary.class.getPackage().getName().replace('.',File.separatorChar));
     System.out.println("done");
     

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Thu Jan  5 14:58:50 2012
@@ -33,7 +33,6 @@ import java.util.Map.Entry;
 import java.util.Set;
 import java.util.TreeMap;
 
-import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder.DictionaryFormat;
 
 import com.ibm.icu.text.Normalizer2;
@@ -66,7 +65,7 @@ public class TokenInfoDictionaryBuilder 
     this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
   }
   
-  public TokenInfoDictionary build(String dirname) throws IOException {
+  public TokenInfoDictionaryWriter build(String dirname) throws IOException {
     FilenameFilter filter = new FilenameFilter() {
       @Override
       public boolean accept(File dir, String name) {
@@ -81,8 +80,8 @@ public class TokenInfoDictionaryBuilder 
     return buildDictionary(csvFiles);
   }
   
-  public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
-    TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024);
+  public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
+    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
     
     for (File file : csvFiles){
       FileInputStream inputStream = new FileInputStream(file);

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java?rev=1227618&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryWriter.java Thu Jan  5 14:58:50 2012
@@ -0,0 +1,165 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.Channels;
+import java.nio.channels.WritableByteChannel;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.CodecUtil;
+
+import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+
+public class TokenInfoDictionaryWriter {
+  protected ByteBuffer buffer;
+  protected int[][] targetMap = new int[1][];
+  
+  public TokenInfoDictionaryWriter(int size) {
+    targetMap = new int[1][];
+    buffer = ByteBuffer.allocate(size);
+  }
+  
+  /**
+   * put the entry in map
+   * @return current position of buffer, which will be wordId of next entry
+   */
+  public int put(String[] entry) {
+    short leftId = Short.parseShort(entry[1]);
+    short rightId = Short.parseShort(entry[2]);
+    short wordCost = Short.parseShort(entry[3]);
+    
+    StringBuilder sb = new StringBuilder();
+    for (int i = 4; i < entry.length; i++){
+      sb.append(entry[i]).append(Dictionary.INTERNAL_SEPARATOR);
+    }
+    String features = sb.deleteCharAt(sb.length() - 1).toString();
+    int featuresSize = features.length()* 2;
+    
+    // extend buffer if necessary
+    int left = buffer.limit() - buffer.position();
+    if (8 + featuresSize > left) { // four short and features
+      ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
+      buffer.flip();
+      newBuffer.put(buffer);
+      buffer = newBuffer;
+    }
+    
+    buffer.putShort(leftId);
+    buffer.putShort(rightId);
+    buffer.putShort(wordCost);
+    buffer.putShort((short)featuresSize);
+    for (char c : features.toCharArray()){
+      buffer.putChar(c);
+    }
+    
+    return buffer.position();
+  }
+  
+  public void addMapping(int sourceId, int wordId) {
+    if(targetMap.length <= sourceId) {
+      int[][] newArray = new int[sourceId + 1][];
+      System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
+      targetMap = newArray;
+    }
+    
+    // Prepare array -- extend the length of array by one
+    int[] current = targetMap[sourceId];
+    if (current == null) {
+      current = new int[1];
+    } else {
+      int[] newArray = new int[current.length + 1];
+      System.arraycopy(current, 0, newArray, 0, current.length);
+      current = newArray;
+    }
+    targetMap[sourceId] = current;
+    
+    int[] targets = targetMap[sourceId];
+    targets[targets.length - 1] = wordId;
+  }
+
+  /**
+   * Write dictionary in file
+   * Dictionary format is:
+   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+   * @throws IOException
+   */
+  public void write(String directoryname) throws IOException {
+    writeDictionary(directoryname + File.separator + TokenInfoDictionary.FILENAME);
+    writeTargetMap(directoryname + File.separator + TokenInfoDictionary.TARGETMAP_FILENAME);
+  }
+  
+  protected void writeTargetMap(String filename) throws IOException {
+    OutputStream os = new FileOutputStream(filename);
+    try {
+      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, TokenInfoDictionary.TARGETMAP_HEADER, TokenInfoDictionary.VERSION);
+      out.writeVInt(targetMap.length);
+      int nulls = 0;
+      for (int[] a : targetMap) {
+        if (a == null) {
+          // run-length encoding for all nulls:
+          if (nulls == 0) {
+            out.writeVInt(0);
+          }
+          nulls++;
+        } else {
+          if (nulls > 0) {
+            out.writeVInt(nulls);
+            nulls = 0;
+          }
+          assert a.length > 0;
+          out.writeVInt(a.length);
+          for (int i = 0; i < a.length; i++) {
+            out.writeVInt(a[i]);
+          }
+        }
+      }
+      // write the pending RLE count:
+      if (nulls > 0) {
+        out.writeVInt(nulls);
+      }
+    } finally {
+      os.close();
+    }
+  }
+  
+  protected void writeDictionary(String filename) throws IOException {
+    final FileOutputStream os = new FileOutputStream(filename);
+    try {
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, TokenInfoDictionary.DICT_HEADER, TokenInfoDictionary.VERSION);
+      out.writeVInt(buffer.position());
+      final WritableByteChannel channel = Channels.newChannel(os);
+      // Write Buffer
+      buffer.flip();  // set position to 0, set limit to current position
+      channel.write(buffer);
+      assert buffer.remaining() == 0L;
+    } finally {
+      os.close();
+    }
+  }
+}

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java?rev=1227618&r1=1227617&r2=1227618&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java Thu Jan  5 14:58:50 2012
@@ -26,8 +26,6 @@ import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 
-import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
-
 public class UnknownDictionaryBuilder {
   private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*";
   
@@ -41,21 +39,20 @@ public class UnknownDictionaryBuilder {
     this.encoding = encoding;
   }
   
-  public UnknownDictionary build(String dirname) throws IOException {
-    UnknownDictionary unkDictionary = null;
-    unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
+  public UnknownDictionaryWriter build(String dirname) throws IOException {
+    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
     readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
     return unkDictionary;
   }
   
-  public UnknownDictionary readDictionaryFile(String filename)
+  public UnknownDictionaryWriter readDictionaryFile(String filename)
       throws IOException {
     return readDictionaryFile(filename, encoding);
   }
   
-  public UnknownDictionary readDictionaryFile(String filename, String encoding)
+  public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
       throws IOException {
-    UnknownDictionary dictionary = new UnknownDictionary(5 * 1024 * 1024);
+    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
     
     FileInputStream inputStream = new FileInputStream(filename);
     Charset cs = Charset.forName(encoding);
@@ -75,7 +72,7 @@ public class UnknownDictionaryBuilder {
     return dictionary;
   }
   
-  public void readCharacterDefinition(String filename, UnknownDictionary dictionary) throws IOException {
+  public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
     FileInputStream inputStream = new FileInputStream(filename);
     InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
     LineNumberReader lineReader = new LineNumberReader(streamReader);

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java?rev=1227618&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryWriter.java Thu Jan  5 14:58:50 2012
@@ -0,0 +1,55 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
+import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
+
+public class UnknownDictionaryWriter extends TokenInfoDictionaryWriter {
+  private final CharacterDefinition characterDefinition = new CharacterDefinition();
+  
+  public UnknownDictionaryWriter(int size) {
+    super(size);
+  }
+  
+  @Override
+  public int put(String[] entry) {
+    // Get wordId of current entry
+    int wordId = buffer.position();
+    
+    // Put entry
+    int result = super.put(entry);
+    
+    // Put entry in targetMap
+    int characterId = CharacterDefinition.lookupCharacterClass(entry[0]);
+    addMapping(characterId, wordId);
+    return result;
+  }
+  
+  /**
+   * Put mapping from unicode code point to character class.
+   * 
+   * @param codePoint code point
+   * @param characterClassName character class name
+   */
+  public void putCharacterCategory(int codePoint, String characterClassName) {
+    characterDefinition.putCharacterCategory(codePoint, characterClassName);
+  }
+  
+  public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
+    characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
+  }
+  
+  /**
+   * Write dictionary in file
+   * Dictionary format is:
+   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+   * @throws IOException
+   */
+  public void write(String directoryname) throws IOException {
+    writeDictionary(directoryname + File.separator + UnknownDictionary.FILENAME);
+    writeTargetMap(directoryname + File.separator + UnknownDictionary.TARGETMAP_FILENAME);
+    characterDefinition.write(directoryname);
+  }
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java?rev=1227618&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionaryTest.java Thu Jan  5 14:58:50 2012
@@ -0,0 +1,75 @@
+package org.apache.lucene.analysis.kuromoji.dict;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
+import org.apache.lucene.analysis.kuromoji.util.UnknownDictionaryWriter;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+public class UnknownDictionaryTest extends LuceneTestCase {
+  public static final String FILENAME = "unk-tokeninfo-dict.obj";
+  
+  @Test
+  public void testPutCharacterCategory() {
+    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+    
+    try{
+      unkDic.putCharacterCategory(0, "DUMMY_NAME");
+      fail();
+    } catch(Exception e) {
+      
+    }
+    
+    try{
+      unkDic.putCharacterCategory(-1, "KATAKANA");
+      fail();
+    } catch(Exception e) {
+      
+    }
+    
+    unkDic.putCharacterCategory(0, "DEFAULT");
+    unkDic.putCharacterCategory(1, "GREEK");
+    unkDic.putCharacterCategory(2, "HIRAGANA");
+    unkDic.putCharacterCategory(3, "KATAKANA");
+    unkDic.putCharacterCategory(4, "KANJI");
+  }
+  
+  @Test
+  public void testPut() {
+    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+    try{
+      unkDic.put(CSVUtil.parse("KANJI,1285,11426,åè©,ä¸è¬,*,*,*,*,*"));
+      fail();
+    } catch(Exception e){
+      
+    }
+    
+    String entry1 = "KANJI,1285,1285,11426,åè©,ä¸è¬,*,*,*,*,*";
+    String entry2 = "ALPHA,1285,1285,13398,åè©,ä¸è¬,*,*,*,*,*";
+    String entry3 = "HIRAGANA,1285,1285,13069,åè©,ä¸è¬,*,*,*,*,*";
+    
+    unkDic.putCharacterCategory(0, "KANJI");
+    unkDic.putCharacterCategory(1, "ALPHA");
+    unkDic.putCharacterCategory(2, "HIRAGANA");
+    
+    unkDic.put(CSVUtil.parse(entry1));
+    unkDic.put(CSVUtil.parse(entry2));
+    unkDic.put(CSVUtil.parse(entry3));
+  }
+}