You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 22:29:10 UTC

svn commit: r1534364 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: data/ java/org/apache/lucene/analysis/ko/dic/ resources/org/apache/lucene/analysis/ko/dic/ tools/java/org/apache/lucene/analysis/ko/dic/

Author: rmuir
Date: Mon Oct 21 20:29:09 2013
New Revision: 1534364

URL: http://svn.apache.org/r1534364
Log:
LUCENE-4956: use a byte1 jamo FST, smaller and much faster

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic Mon Oct 21 20:29:09 2013
@@ -20606,8 +20606,6 @@
 옳,010000000X
 옴,100000000X
 옴니버스,100000000X
-옴지락거리ㅌ,010000000X
-옴지락대ㅌ,010000000X
 옴짝달싹,001100000X
 옴쭉달싹,001100000X
 옴츠리,010000000X
@@ -24399,7 +24397,6 @@
 쟁론,100100000X
 쟁반,100000000X
 쟁알거리,010000000X
-쟁알대ㅌ,010000000X
 쟁의,100100000X
 쟁이,010000000X
 쟁쟁하,010000000X

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic Mon Oct 21 20:29:09 2013
@@ -4341,7 +4341,6 @@
 케냐프,100110000X
 케라틴,100110000X
 케비넬,100110000X
-케이ㅋ,100110000X
 케이른스,100110000X
 케이슨,100110000X
 케이지,100110000X
@@ -4523,8 +4522,6 @@
 크로토노니트릴,100110000X
 크로톤알데히드,100110000X
 크롤러,100110000X
-크롬 Ⅲ,100110000X
-크롬 Ⅵ,100110000X
 크루시에이타,100110000X
 크리글러,100110000X
 크리깅,100110000X

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 20:29:09 2013
@@ -30,6 +30,12 @@ class HangulDictionary {
   
   static final int RECORD_SIZE = 15;
   
+  static final int SBASE = 0xAC00;
+  static final int HANGUL_B0 = 0xE0 | (SBASE >> 12);
+  static final int VCOUNT = 21;
+  static final int TCOUNT = 28;
+  static final int NCOUNT = VCOUNT * TCOUNT;
+  
   public HangulDictionary(FST<Byte> fst, byte[] metadata) {
     this.fst = fst;
     this.metadata = metadata;
@@ -49,14 +55,42 @@ class HangulDictionary {
     Byte output = fst.outputs.getNoOutput();
     for (int i = 0; i < key.length(); i++) {
       try {
-        if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+        char ch = key.charAt(i);
+        if (ch < 0xFF) {
+          // latin-1 (ch < 0xFF): look up as a 3-byte UTF-8-style sequence led by HANGUL_B0
+          if (fst.findTargetArc(HANGUL_B0, arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+          if (fst.findTargetArc(0x80 | ((ch >> 6) & 0x3F), arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+          if (fst.findTargetArc(0x80 | (ch & 0x3F), arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+        } else if (ch >= SBASE && ch <= 0xD7AF) {
+          // hangul syllable: decompose into lead/vowel/tail jamo indexes, one byte each (low, latin-1 range)
+          ch -= SBASE;
+          if (fst.findTargetArc(ch / NCOUNT, arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+          if (fst.findTargetArc((ch % NCOUNT) / TCOUNT, arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+          if (fst.findTargetArc(ch % TCOUNT, arc, arc, fstReader) == null) {
+            return null;
+          }
+          output = fst.outputs.add(output, arc.output);
+        } else {
           return null;
         }
       } catch (IOException bogus) {
         throw new RuntimeException();
       }
-      // we shouldnt need this accumulation?!
-      output = fst.outputs.add(output, arc.output);
     }
 
     if (arc.isFinal()) {
@@ -119,7 +153,31 @@ class HangulDictionary {
 
     for (int i = 0; i < key.length(); i++) {
       try {
-        if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+        char ch = key.charAt(i);
+        if (ch < 0xFF) {
+          // latin-1 (ch < 0xFF): look up as a 3-byte UTF-8-style sequence led by HANGUL_B0
+          if (fst.findTargetArc(HANGUL_B0, arc, arc, fstReader) == null) {
+            return false;
+          }
+          if (fst.findTargetArc(0x80 | ((ch >> 6) & 0x3F), arc, arc, fstReader) == null) {
+            return false;
+          }
+          if (fst.findTargetArc(0x80 | (ch & 0x3F), arc, arc, fstReader) == null) {
+            return false;
+          }
+        } else if (ch >= SBASE && ch <= 0xD7AF) {
+          // hangul syllable: decompose into lead/vowel/tail jamo indexes, one byte each (low, latin-1 range)
+          ch -= SBASE;
+          if (fst.findTargetArc(ch / NCOUNT, arc, arc, fstReader) == null) {
+            return false;
+          }
+          if (fst.findTargetArc((ch % NCOUNT) / TCOUNT, arc, arc, fstReader) == null) {
+            return false;
+          }
+          if (fst.findTargetArc(ch % TCOUNT, arc, arc, fstReader) == null) {
+            return false;
+          }
+        } else {
           return false;
         }
       } catch (IOException bogus) {

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 20:29:09 2013
@@ -28,6 +28,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -225,12 +226,42 @@ public class DictionaryBuilder {
   
   
   /** 
-   * makes FST (currently byte2 syllables) mapping to "word class"
+   * makes FST (byte1) mapping to "word class"
+   * syllables are decomposed to jamo (one byte each); latin1 chars take a 3-byte form instead
+   * (this makes the FST both smaller and much faster)
    * each word has features + compound data, but many of them share the
    * same set of features, and have simple compound splits in the same place.
    */
   static void buildHangulDict(File inputDir, File outputDir) throws Exception {
-    TreeMap<String,Integer> sorted = new TreeMap<String,Integer>();
+    TreeMap<String,Integer> sorted = new TreeMap<String,Integer>(new Comparator<String>() {
+
+      @Override
+      public int compare(String a, String b) {
+        final int stop = Math.min(a.length(), b.length());
+        int upto = 0;
+        while(upto < stop) {
+          int aChar = remap(a.charAt(upto));
+          int bChar = remap(b.charAt(upto));
+
+          int diff = aChar - bChar;
+          if (diff != 0) {
+            return diff;
+          }
+          upto++;
+        }
+        // One is a prefix of the other, or, they are equal:
+        return a.length() - b.length();
+      }
+      
+      int remap(char ch) {
+        if (ch < 0xff) {
+          return ch + HangulDictionary.SBASE;
+        } else {
+          assert ch >= HangulDictionary.SBASE && ch <= 0xD7AF : ch;
+          return ch - HangulDictionary.SBASE;
+        }
+      }
+    });
     Map<Output,Integer> classes = new LinkedHashMap<>();
     File input = new File(inputDir, "dictionary.dic");
     BufferedReader reader = new BufferedReader(IOUtils.getDecodingReader(input, IOUtils.CHARSET_UTF_8));
@@ -261,14 +292,27 @@ public class DictionaryBuilder {
     System.out.println("#classes: " + classes.size());
     Outputs<Byte> fstOutput = ByteOutputs.getSingleton();
     // why does packed=false give a smaller fst?!?!
-    Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
+    Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
     IntsRef scratch = new IntsRef();
     for (Map.Entry<String,Integer> e : sorted.entrySet()) {
       String token = e.getKey();
-      scratch.grow(token.length());
-      scratch.length = token.length();
+      scratch.grow(token.length() * 3);
+      scratch.length = token.length() * 3;
       for (int i = 0; i < token.length(); i++) {
-        scratch.ints[i] = (int) token.charAt(i);
+        char ch = token.charAt(i);
+        if (ch < 0xFF) {
+          scratch.ints[3*i] = HangulDictionary.HANGUL_B0;
+          scratch.ints[3*i+1] = (0x80 | ((ch >> 6) & 0x3F));
+          scratch.ints[3*i+2] = (0x80 | (ch & 0x3F));
+        } else if (ch >= HangulDictionary.SBASE && ch <= 0xD7AF) {
+          // hangul syllable: decompose into lead/vowel/tail jamo indexes, one byte each (low, latin-1 range)
+          ch -= HangulDictionary.SBASE;
+          scratch.ints[3*i] = (ch / HangulDictionary.NCOUNT);
+          scratch.ints[3*i+1] = ((ch % HangulDictionary.NCOUNT) / HangulDictionary.TCOUNT);
+          scratch.ints[3*i+2] = (ch % HangulDictionary.TCOUNT);
+        } else {
+          assert false : ch;
+        }
       }
       int v = e.getValue();
       assert v >= 0 && v < 128;