You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/19 18:18:54 UTC

svn commit: r1533782 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang: ./ src/java/org/apache/lucene/analysis/ko/dic/ src/java/org/apache/lucene/analysis/ko/utils/ src/resources/org/apache/lucene/analysis/ko/dic/ src/test/org/apache/lucene/...

Author: rmuir
Date: Sat Oct 19 16:18:54 2013
New Revision: 1533782

URL: http://svn.apache.org/r1533782
Log:
LUCENE-4956: don't expose this as a large hashmap with thousands of arrays

Added:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat   (with props)
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx   (with props)
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java   (with props)
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
      - copied, changed from r1533695, lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java
Removed:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/mapHanja.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java
Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/build.xml Sat Oct 19 16:18:54 2013
@@ -35,10 +35,11 @@
   <!-- for rebuilding dictionary -->
   <path id="tools.classpath">
     <path refid="classpath"/>
+    <pathelement location="${build.dir}/classes/java"/>
     <pathelement location="${build.dir}/classes/tools"/>
   </path>
 
-  <target name="compile-tools" depends="common.compile-tools">
+  <target name="compile-tools" depends="common.compile-tools,compile-core">
     <compile
       srcdir="src/tools/java"
       destdir="${build.dir}/classes/tools">
@@ -56,7 +57,7 @@
       <delete verbose="true">
         <fileset dir="${dict.target.dir}" includes="**/*"/>
       </delete>
-      <java fork="true" failonerror="true" maxmemory="256M" classname="org.apache.lucene.analysis.ko.DictionaryBuilder">
+      <java fork="true" failonerror="true" maxmemory="256M" classname="org.apache.lucene.analysis.ko.dic.DictionaryBuilder">
         <classpath>
           <path refid="tools.classpath"/>
         </classpath>

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Sat Oct 19 16:18:54 2013
@@ -49,8 +49,9 @@ public class DictionaryResources {
   
   public static final String FILE_UNCOMPOUNDS = "uncompounds.dic";
   
-  public static final String FILE_MAP_HANJA_DIC = "mapHanja.dic";
-
+  public static final String FILE_HANJA_IDX = "hanja.idx";
+  public static final String FILE_HANJA_DAT = "hanja.dat";
+  public static final int DATA_VERSION = 0;
 
   private DictionaryResources() {}
 
@@ -86,5 +87,4 @@ public class DictionaryResources {
       processor.processLine(line);
     }
   }
-
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java?rev=1533782&r1=1533781&r2=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/HanjaUtils.java Sat Oct 19 16:18:54 2013
@@ -18,44 +18,60 @@ package org.apache.lucene.analysis.ko.ut
  */
 
 import java.io.IOException;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.InputStream;
 
 import org.apache.lucene.analysis.ko.dic.DictionaryResources;
-import org.apache.lucene.analysis.ko.dic.LineProcessor;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 
 public class HanjaUtils {
   private HanjaUtils() {}
 
-  private static final Map<Character, char[]> mapHanja;
+  private static final int HANJA_START = 0x3400;
+  private static final MonotonicBlockPackedReader index;
+  private static final char[] data;
   static {
+    InputStream datStream = null, idxStream = null;
     try {
-      final Map<Character, char[]> map = new HashMap<Character, char[]>();    
-      DictionaryResources.readLines(DictionaryResources.FILE_MAP_HANJA_DIC, new LineProcessor() {
-        @Override
-        public void processLine(String s) throws IOException {
-          String[] hanInfos = s.split("[,]+");
-          if(hanInfos.length!=2 || hanInfos[0].length()!=1)
-            throw new IOException("Invalid file format: "+s);
-          
-          map.put(hanInfos[0].charAt(0), hanInfos[1].toCharArray());
-        }
-      });      
-      mapHanja = Collections.unmodifiableMap(map);
+      datStream = DictionaryResources.class.getResourceAsStream(DictionaryResources.FILE_HANJA_DAT);
+      idxStream = DictionaryResources.class.getResourceAsStream(DictionaryResources.FILE_HANJA_IDX);
+      DataInput dat = new InputStreamDataInput(datStream);
+      DataInput idx = new InputStreamDataInput(idxStream);
+      CodecUtil.checkHeader(dat, DictionaryResources.FILE_HANJA_DAT, DictionaryResources.DATA_VERSION, DictionaryResources.DATA_VERSION);
+      CodecUtil.checkHeader(idx, DictionaryResources.FILE_HANJA_IDX, DictionaryResources.DATA_VERSION, DictionaryResources.DATA_VERSION);
+      data = new char[dat.readVInt()];
+      for (int i = 0; i < data.length; i++) {
+        data[i] = (char) dat.readShort();
+        assert Character.UnicodeBlock.of(data[i]) == Character.UnicodeBlock.HANGUL_SYLLABLES;
+      }
+      index = new MonotonicBlockPackedReader(idx, idx.readVInt(), idx.readVInt(), idx.readVInt(), false);
     } catch (IOException ioe) {
-      throw new Error("Cannot load: " + DictionaryResources.FILE_MAP_HANJA_DIC, ioe);
+      throw new Error(ioe);
+    } finally {
+      IOUtils.closeWhileHandlingException(datStream, idxStream);
     }
   }
   
-  /**
-   * 한자에 대응하는 한글을 찾아서 반환한다.
-   * 하나의 한자는 여러 음으로 읽일 수 있으므로 가능한 모든 음을 한글로 반환한다.
-   */
+  /** 
+   * Returns array of hangul pronunciations.
+   * TODO: expose this in another way */
   public static char[] convertToHangul(char hanja) {
-//    if(hanja>0x9FFF||hanja<0x3400) return new char[]{hanja};
-    
-    final char[] result = mapHanja.get(hanja);
-    return (result==null) ? new char[]{hanja} : result;
+    if (hanja < HANJA_START) {
+      return new char[] { hanja };
+    } else {
+      int idx = hanja - HANJA_START;
+      int start = (int) index.get(idx);
+      int end = (int) index.get(idx+1);
+      if (end - start == 0) {
+        return new char[] { hanja };
+      } else {
+        char result[] = new char[end - start];
+        System.arraycopy(data, start, result, 0, end - start);
+        return result;
+      }
+    }
   }
 }

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.dat?rev=1533782&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/hanja.idx?rev=1533782&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java?rev=1533782&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestHanjaUtils.java Sat Oct 19 16:18:54 2013
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.ko.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestHanjaUtils extends LuceneTestCase {
+  
+  public void testOneToOne() {
+    assertEquals("구",  new String(HanjaUtils.convertToHangul('㐀')));
+    assertEquals("인",  new String(HanjaUtils.convertToHangul('印')));
+  }
+  
+  public void testOneToMany() {
+    assertEquals("기지",  new String(HanjaUtils.convertToHangul('枳')));
+  }
+  
+  public void testOutOfBounds() {
+    assertEquals("\u33FF", new String(HanjaUtils.convertToHangul('\u33FF')));
+    assertEquals("A", new String(HanjaUtils.convertToHangul('A')));
+    assertEquals("\uFF09", new String(HanjaUtils.convertToHangul('\uFF09')));
+  }
+  
+  public void testEitherHangulOrItselfBack() {
+    for (int i = 0; i <= 0xFFFF; i++) {
+      char res[] = HanjaUtils.convertToHangul((char)i);
+      if (res.length == 1 && res[0] == i) {
+        continue;
+      } else {
+        assert res.length > 0;
+        for (int j = 0; j < res.length; j++) {
+          assertEquals(Character.UnicodeBlock.HANGUL_SYLLABLES, Character.UnicodeBlock.of(res[j]));
+        }
+      }
+    }
+  }
+}

Copied: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (from r1533695, lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?p2=lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java&p1=lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java&r1=1533695&r2=1533782&rev=1533782&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Sat Oct 19 16:18:54 2013
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.ko;
+package org.apache.lucene.analysis.ko.dic;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -19,16 +19,35 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.InputStream;
 import java.io.OutputStream;
 
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
 public class DictionaryBuilder {
   public static void main(String args[]) throws Exception {
-    String FILES_AS_IS[] = { "compounds.dic", "dictionary.dic", "eomi.dic", "extension.dic", "josa.dic", "mapHanja.dic",
-                       "prefix.dic", "suffix.dic", "syllable.dic", "uncompounds.dic" };
+    String FILES_AS_IS[] = { 
+      DictionaryResources.FILE_COMPOUNDS,
+      DictionaryResources.FILE_DICTIONARY,
+      DictionaryResources.FILE_EOMI,
+      DictionaryResources.FILE_EXTENSION,
+      DictionaryResources.FILE_JOSA,
+      DictionaryResources.FILE_PREFIX,
+      DictionaryResources.FILE_SUFFIX,
+      DictionaryResources.FILE_SYLLABLE_FEATURE,
+      DictionaryResources.FILE_UNCOMPOUNDS
+    };
+    
     File inputDir = new File(args[0]);
     File outputDir = new File(args[1]);
     for (String file : FILES_AS_IS) {
@@ -36,6 +55,7 @@ public class DictionaryBuilder {
       File out = new File(outputDir, file);
       copyAsIs(in, out);
     }
+    buildHanjaMap(inputDir, outputDir);
   }
   
   static void copyAsIs(File in, File out) throws Exception {
@@ -48,4 +68,56 @@ public class DictionaryBuilder {
     r.close();
     w.close();
   }
+  
+  static void buildHanjaMap(File inputDir, File outputDir) throws Exception {
+    final int HANJA_START = 0x3400;
+    final int IDX_SIZE = 0x10000 - HANJA_START;
+    OutputStream idxStream = new BufferedOutputStream(new FileOutputStream(new File(outputDir, DictionaryResources.FILE_HANJA_IDX)));
+    DataOutput idx = new OutputStreamDataOutput(idxStream);
+    CodecUtil.writeHeader(idx, DictionaryResources.FILE_HANJA_IDX, DictionaryResources.DATA_VERSION);
+    idx.writeVInt(PackedInts.VERSION_CURRENT);
+    idx.writeVInt(1024);
+    idx.writeVInt(IDX_SIZE+1); // CJK: first half of unicode, compat: at the end. but monotonic's blocking works here (?)
+    MonotonicBlockPackedWriter idxArray = new MonotonicBlockPackedWriter(idx, 1024);
+    
+    OutputStream datStream = new BufferedOutputStream(new FileOutputStream(new File(outputDir, DictionaryResources.FILE_HANJA_DAT)));
+    DataOutput dat = new OutputStreamDataOutput(datStream);
+    CodecUtil.writeHeader(dat, DictionaryResources.FILE_HANJA_DAT, DictionaryResources.DATA_VERSION);
+    char datArray[] = new char[256];
+    File input = new File(inputDir, "mapHanja.dic");
+    BufferedReader reader = new BufferedReader(IOUtils.getDecodingReader(input, IOUtils.CHARSET_UTF_8));
+    int currentInput = -1;
+    int currentOutput = 0;
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (line.startsWith("!")) {
+        continue;
+      }
+      int cp = line.charAt(0) - HANJA_START;
+      while (currentInput < cp) {
+        idxArray.add(currentOutput);
+        currentInput++;
+      }
+      String mappings = line.substring(2);
+      for (int i = 0; i < mappings.length(); i++) {
+        if (currentOutput == datArray.length) {
+          datArray = ArrayUtil.grow(datArray);
+        }
+        datArray[currentOutput] = mappings.charAt(i);
+        currentOutput++;
+      }
+      currentInput = cp;
+    }
+    while (currentInput < IDX_SIZE) {
+      idxArray.add(currentOutput);
+      currentInput++;
+    }
+    idxArray.finish();
+    dat.writeVInt(currentOutput);
+    for (int i = 0; i < currentOutput; i++) {
+      dat.writeShort((short) datArray[i]); 
+    }
+    idxStream.close();
+    datStream.close();
+  }
 }