You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 22:29:10 UTC
svn commit: r1534364 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: data/
java/org/apache/lucene/analysis/ko/dic/
resources/org/apache/lucene/analysis/ko/dic/
tools/java/org/apache/lucene/analysis/ko/dic/
Author: rmuir
Date: Mon Oct 21 20:29:09 2013
New Revision: 1534364
URL: http://svn.apache.org/r1534364
Log:
LUCENE-4956: use a byte1 jamo FST, smaller and much faster
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/dictionary.dic Mon Oct 21 20:29:09 2013
@@ -20606,8 +20606,6 @@
ì³,010000000X
ì´,100000000X
ì´ëë²ì¤,100000000X
-ì´ì§ë½ê±°ë¦¬ã
,010000000X
-ì´ì§ë½ëã
,010000000X
ì´ì§ë¬ì¹,001100000X
ì´ìë¬ì¹,001100000X
ì´ì¸ 리,010000000X
@@ -24399,7 +24397,6 @@
ìë¡ ,100100000X
ìë°,100000000X
ìì거리,010000000X
-ììëã
,010000000X
ìì,100100000X
ìì´,010000000X
ììí,010000000X
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic Mon Oct 21 20:29:09 2013
@@ -4341,7 +4341,6 @@
ì¼ëí,100110000X
ì¼ë¼í´,100110000X
ì¼ë¹ë¬,100110000X
-ì¼ì´ã
,100110000X
ì¼ì´ë¥¸ì¤,100110000X
ì¼ì´ì¨,100110000X
ì¼ì´ì§,100110000X
@@ -4523,8 +4522,6 @@
크로토노니트릴,100110000X
í¬ë¡í¤ìë°íë,100110000X
í¬ë¡¤ë¬,100110000X
-크롬 Ⅲ,100110000X
-크롬 Ⅵ,100110000X
í¬ë£¨ììì´í,100110000X
í¬ë¦¬ê¸ë¬,100110000X
크리깅,100110000X
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 20:29:09 2013
@@ -30,6 +30,12 @@ class HangulDictionary {
static final int RECORD_SIZE = 15;
+ static final int SBASE = 0xAC00;
+ static final int HANGUL_B0 = 0xE0 | (SBASE >> 12);
+ static final int VCOUNT = 21;
+ static final int TCOUNT = 28;
+ static final int NCOUNT = VCOUNT * TCOUNT;
+
public HangulDictionary(FST<Byte> fst, byte[] metadata) {
this.fst = fst;
this.metadata = metadata;
@@ -49,14 +55,42 @@ class HangulDictionary {
Byte output = fst.outputs.getNoOutput();
for (int i = 0; i < key.length(); i++) {
try {
- if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+ char ch = key.charAt(i);
+ if (ch < 0xFF) {
+ // latin-1: remap to hangul syllable
+ if (fst.findTargetArc(HANGUL_B0, arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ if (fst.findTargetArc(0x80 | ((ch >> 6) & 0x3F), arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ if (fst.findTargetArc(0x80 | (ch & 0x3F), arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ } else if (ch >= SBASE && ch <= 0xD7AF) {
+ // hangul syllable: decompose to jamo and remap to latin-1
+ ch -= SBASE;
+ if (fst.findTargetArc(ch / NCOUNT, arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ if (fst.findTargetArc((ch % NCOUNT) / TCOUNT, arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ if (fst.findTargetArc(ch % TCOUNT, arc, arc, fstReader) == null) {
+ return null;
+ }
+ output = fst.outputs.add(output, arc.output);
+ } else {
return null;
}
} catch (IOException bogus) {
throw new RuntimeException();
}
- // we shouldnt need this accumulation?!
- output = fst.outputs.add(output, arc.output);
}
if (arc.isFinal()) {
@@ -119,7 +153,31 @@ class HangulDictionary {
for (int i = 0; i < key.length(); i++) {
try {
- if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+ char ch = key.charAt(i);
+ if (ch < 0xFF) {
+ // latin-1: remap to hangul syllable
+ if (fst.findTargetArc(HANGUL_B0, arc, arc, fstReader) == null) {
+ return false;
+ }
+ if (fst.findTargetArc(0x80 | ((ch >> 6) & 0x3F), arc, arc, fstReader) == null) {
+ return false;
+ }
+ if (fst.findTargetArc(0x80 | (ch & 0x3F), arc, arc, fstReader) == null) {
+ return false;
+ }
+ } else if (ch >= SBASE && ch <= 0xD7AF) {
+ // hangul syllable: decompose to jamo and remap to latin-1
+ ch -= SBASE;
+ if (fst.findTargetArc(ch / NCOUNT, arc, arc, fstReader) == null) {
+ return false;
+ }
+ if (fst.findTargetArc((ch % NCOUNT) / TCOUNT, arc, arc, fstReader) == null) {
+ return false;
+ }
+ if (fst.findTargetArc(ch % TCOUNT, arc, arc, fstReader) == null) {
+ return false;
+ }
+ } else {
return false;
}
} catch (IOException bogus) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534364&r1=1534363&r2=1534364&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 20:29:09 2013
@@ -28,6 +28,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -225,12 +226,42 @@ public class DictionaryBuilder {
/**
- * makes FST (currently byte2 syllables) mapping to "word class"
+ * makes FST (byte1) mapping to "word class"
+ * syllables are decomposed to jamo, then swapped with latin1
+ * (this makes the FST both smaller and much faster)
* each word has features + compound data, but many of them share the
* same set of features, and have simple compound splits in the same place.
*/
static void buildHangulDict(File inputDir, File outputDir) throws Exception {
- TreeMap<String,Integer> sorted = new TreeMap<String,Integer>();
+ TreeMap<String,Integer> sorted = new TreeMap<String,Integer>(new Comparator<String>() {
+
+ @Override
+ public int compare(String a, String b) {
+ final int stop = Math.min(a.length(), b.length());
+ int upto = 0;
+ while(upto < stop) {
+ int aChar = remap(a.charAt(upto));
+ int bChar = remap(b.charAt(upto));
+
+ int diff = aChar - bChar;
+ if (diff != 0) {
+ return diff;
+ }
+ upto++;
+ }
+ // One is a prefix of the other, or, they are equal:
+ return a.length() - b.length();
+ }
+
+ int remap(char ch) {
+ if (ch < 0xff) {
+ return ch + HangulDictionary.SBASE;
+ } else {
+ assert ch >= HangulDictionary.SBASE && ch <= 0xD7AF : ch;
+ return ch - HangulDictionary.SBASE;
+ }
+ }
+ });
Map<Output,Integer> classes = new LinkedHashMap<>();
File input = new File(inputDir, "dictionary.dic");
BufferedReader reader = new BufferedReader(IOUtils.getDecodingReader(input, IOUtils.CHARSET_UTF_8));
@@ -261,14 +292,27 @@ public class DictionaryBuilder {
System.out.println("#classes: " + classes.size());
Outputs<Byte> fstOutput = ByteOutputs.getSingleton();
// why does packed=false give a smaller fst?!?!
- Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
+ Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
IntsRef scratch = new IntsRef();
for (Map.Entry<String,Integer> e : sorted.entrySet()) {
String token = e.getKey();
- scratch.grow(token.length());
- scratch.length = token.length();
+ scratch.grow(token.length() * 3);
+ scratch.length = token.length() * 3;
for (int i = 0; i < token.length(); i++) {
- scratch.ints[i] = (int) token.charAt(i);
+ char ch = token.charAt(i);
+ if (ch < 0xFF) {
+ scratch.ints[3*i] = HangulDictionary.HANGUL_B0;
+ scratch.ints[3*i+1] = (0x80 | ((ch >> 6) & 0x3F));
+ scratch.ints[3*i+2] = (0x80 | (ch & 0x3F));
+ } else if (ch >= HangulDictionary.SBASE && ch <= 0xD7AF) {
+ // hangul syllable: decompose to jamo and remap to latin-1
+ ch -= HangulDictionary.SBASE;
+ scratch.ints[3*i] = (ch / HangulDictionary.NCOUNT);
+ scratch.ints[3*i+1] = ((ch % HangulDictionary.NCOUNT) / HangulDictionary.TCOUNT);
+ scratch.ints[3*i+2] = (ch % HangulDictionary.TCOUNT);
+ } else {
+ assert false : ch;
+ }
}
int v = e.getValue();
assert v >= 0 && v < 128;