You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/19 21:54:37 UTC
svn commit: r1533813 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/morph/
java/org/apache/lucene/analysis/ko/utils/
resources/org/apache/lucene/analysis/ko/d...
Author: rmuir
Date: Sat Oct 19 19:54:37 2013
New Revision: 1533813
URL: http://svn.apache.org/r1533813
Log:
LUCENE-4956: refactor syllable handling to not be a list of thousands of arrays
Added:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dat (with props)
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestSyllableUtil.java (with props)
Removed:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dic
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/EomiUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/MorphUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/NounUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Sat Oct 19 19:54:37 2013
@@ -31,8 +31,6 @@ import org.apache.lucene.util.IOUtils;
*/
public class DictionaryResources {
- public static final String FILE_SYLLABLE_FEATURE = "syllable.dic";
-
public static final String FILE_DICTIONARY = "dictionary.dic";
public static final String FILE_JOSA = "josa.dic";
@@ -49,6 +47,7 @@ public class DictionaryResources {
public static final String FILE_UNCOMPOUNDS = "uncompounds.dic";
+ public static final String FILE_SYLLABLE_DAT = "syllable.dat";
public static final String FILE_HANJA_IDX = "hanja.idx";
public static final String FILE_HANJA_DAT = "hanja.dat";
public static final int DATA_VERSION = 0;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java Sat Oct 19 19:54:37 2013
@@ -226,24 +226,28 @@ public class MorphAnalyzer {
boolean isVerbOnly = false;
analysisWithEomi(input,"",candidates);
- for(int i=strlen-1;i>0;i--) {
+ for (int i = strlen-1; i > 0; i--) {
- String stem = input.substring(0,i);
+ String stem = input.substring(0, i);
String eomi = input.substring(i);
- char[] feature = SyllableUtil.getFeature(eomi.charAt(0));
- if(!isVerbOnly&&josaFlag&&feature[SyllableUtil.IDX_JOSA1]=='1') {
- analysisWithJosa(stem,eomi,candidates);
+ char ch = eomi.charAt(0);
+ if (!isVerbOnly && josaFlag && SyllableUtil.hasFeature(ch, SyllableUtil.JOSA1)) {
+ analysisWithJosa(stem, eomi, candidates);
}
- if(eomiFlag) {
- analysisWithEomi(stem,eomi,candidates);
+ if (eomiFlag) {
+ analysisWithEomi(stem, eomi, candidates);
+ eomiFlag &= SyllableUtil.hasFeature(ch, SyllableUtil.EOMI2);
}
- if(josaFlag&&feature[SyllableUtil.IDX_JOSA2]=='0') josaFlag = false;
- if(eomiFlag&&feature[SyllableUtil.IDX_EOMI2]=='0') eomiFlag = false;
+ if (josaFlag) {
+ josaFlag &= SyllableUtil.hasFeature(ch, SyllableUtil.JOSA2);
+ }
- if(!josaFlag&&!eomiFlag) break;
+ if (!josaFlag && !eomiFlag) {
+ break;
+ }
}
}
@@ -359,8 +363,8 @@ public class MorphAnalyzer {
output.setScore(AnalysisOutput.SCORE_CORRECT);
MorphUtil.buildPtnVM(output, candidates);
- char[] features = SyllableUtil.getFeature(stem.charAt(stem.length()-1)); // ã¹ë¶ê·ì¹ì¼ ê²½ì°
- if((features[SyllableUtil.IDX_YNPLN]=='0'||morphs[1].charAt(0)!='ã´')
+ char ch = stem.charAt(stem.length()-1); // ã¹ë¶ê·ì¹ì¼ ê²½ì°
+ if ((SyllableUtil.hasFeature(ch, SyllableUtil.YNPLN) == false || morphs[1].charAt(0) != 'ã´')
&&!"ë".equals(end)) // "ê°(V),ë" ë¶ìë ì ìëë¡
return;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Sat Oct 19 19:54:37 2013
@@ -50,7 +50,7 @@ public class WordSpaceAnalyzer {
for(int i=0;i<input.length();i++) {
- char[] f = SyllableUtil.getFeature(input.charAt(i));
+ char ch = input.charAt(i);
String prefix = i==input.length()-1 ? "X" : input.substring(wStart,i+2);
Iterator<String[]> iter = DictionaryUtil.findWithPrefix(prefix);
@@ -73,12 +73,15 @@ public class WordSpaceAnalyzer {
candidates.add(buildSingleOutput(entry));
// í ìì ì´ ì¡°ì¬ë ì´ë¯¸ê° ììëë ìì ì¼ ê°ë¥ì±ì´ ìë¤ë©´...
- } else if(f[SyllableUtil.IDX_EOGAN]=='1'||f[SyllableUtil.IDX_JOSA1]=='1'){
- if(f[SyllableUtil.IDX_JOSA1]=='1')
+ } else if (SyllableUtil.hasFeature(ch, SyllableUtil.EOGAN) ||
+ SyllableUtil.hasFeature(ch, SyllableUtil.JOSA1)) {
+ if (SyllableUtil.hasFeature(ch, SyllableUtil.JOSA1)) {
candidates.addAll(anlysisWithJosa(input.substring(wStart), i-wStart));
+ }
- if(f[SyllableUtil.IDX_EOGAN]=='1')
+ if (SyllableUtil.hasFeature(ch, SyllableUtil.EOGAN)) {
candidates.addAll(anlysisWithEomi(input.substring(wStart), i-wStart));
+ }
}
// 후보가 될 가능성이 높은 순으로 정렬한다.
@@ -156,25 +159,20 @@ public class WordSpaceAnalyzer {
if(jend==-1) return candidates; // íë¹í ì¡°ì¬ê° ìëë¼ë©´...
String input = snippet.substring(0,jend);
-
- boolean josaFlag = true;
- for(int i=input.length()-1;i>0;i--) {
-
+ for (int i = input.length()-1; i > 0; i--) {
String stem = input.substring(0,i);
-
String josa = input.substring(i);
- char[] feature = SyllableUtil.getFeature(josa.charAt(0));
+ char ch = josa.charAt(0);
- if(josaFlag&&feature[SyllableUtil.IDX_JOSA1]=='1') {
- morphAnal.analysisWithJosa(stem,josa,candidates);
+ if (SyllableUtil.hasFeature(ch, SyllableUtil.JOSA1)) {
+ morphAnal.analysisWithJosa(stem, josa, candidates);
}
-
- if(josaFlag&&feature[SyllableUtil.IDX_JOSA2]=='0') josaFlag = false;
-
- if(!josaFlag) break;
+ if (!SyllableUtil.hasFeature(ch, SyllableUtil.JOSA2)) {
+ break;
+ }
}
if(input.length()==1) {
@@ -209,9 +207,10 @@ public class WordSpaceAnalyzer {
}
// ì¡°ì¬ì 2ìì ë¡ ì¬ì©ë ì ë§ì§ë§ ìì ì ì°¾ëë¤.
- for(int i=jstart+1;i<snippet.length();i++) {
- char[] f = SyllableUtil.getFeature(snippet.charAt(i));
- if(f[SyllableUtil.IDX_JOSA2]=='0') break;
+ for (int i = jstart+1; i < snippet.length(); i++) {
+ if (!SyllableUtil.hasFeature(snippet.charAt(i), SyllableUtil.JOSA2)) {
+ break;
+ }
jend = i;
}
@@ -364,36 +363,27 @@ public class WordSpaceAnalyzer {
return candidates;
}
- private void anlysisWithEomiDetail(String input, List<AnalysisOutput> candidates )
- {
-
- boolean eomiFlag = true;
-
+ private void anlysisWithEomiDetail(String input, List<AnalysisOutput> candidates) {
int strlen = input.length();
char ch = input.charAt(strlen-1);
- char[] feature = SyllableUtil.getFeature(ch);
- if(feature[SyllableUtil.IDX_YNPNA]=='1'||feature[SyllableUtil.IDX_YNPLA]=='1'||
- feature[SyllableUtil.IDX_YNPMA]=='1')
+ if (SyllableUtil.hasFeature(ch, SyllableUtil.YNPNA) ||
+ SyllableUtil.hasFeature(ch, SyllableUtil.YNPLA) ||
+ SyllableUtil.hasFeature(ch, SyllableUtil.YNPMA)) {
morphAnal.analysisWithEomi(input,"",candidates);
+ }
- for(int i=strlen-1;i>0;i--) {
-
+ for (int i = strlen-1; i > 0; i--) {
String stem = input.substring(0,i);
String eomi = input.substring(i);
- feature = SyllableUtil.getFeature(eomi.charAt(0));
-
- if(eomiFlag) {
- morphAnal.analysisWithEomi(stem,eomi,candidates);
- }
+ morphAnal.analysisWithEomi(stem,eomi,candidates);
- if(eomiFlag&&feature[SyllableUtil.IDX_EOMI2]=='0') eomiFlag = false;
-
- if(!eomiFlag) break;
+ if (!SyllableUtil.hasFeature(eomi.charAt(0), SyllableUtil.EOMI2)) {
+ break;
+ }
}
-
}
/**
@@ -420,9 +410,10 @@ public class WordSpaceAnalyzer {
// ì¡°ì¬ì 2ìì ë¡ ì¬ì©ë ì ë§ì§ë§ ìì ì ì°¾ëë¤.
int start = 0;
- for(int i=1;i<tail.length();i++) {
- char[] f = SyllableUtil.getFeature(tail.charAt(i));
- if(f[SyllableUtil.IDX_EOGAN]=='0') break;
+ for (int i = 1; i < tail.length(); i++) {
+ if (!SyllableUtil.hasFeature(tail.charAt(i), SyllableUtil.EOGAN)) {
+ break;
+ }
start = i;
}
@@ -469,7 +460,7 @@ public class WordSpaceAnalyzer {
int nEnd = output.getLastEnd()+o.getSource().length();
- char[] f = nEnd<input.length() ? SyllableUtil.getFeature(input.charAt(nEnd)) : null;
+ boolean hasJOSA1 = nEnd < input.length() ? SyllableUtil.hasFeature(input.charAt(nEnd), SyllableUtil.JOSA1) : false;
// 밥먹고 같은 경우가 가능하나.. 먹고는 명사가 아니다.
if(po!=null&&po.getPatn()==PatternConstants.PTN_N&&candidates.size()>0&&
@@ -485,7 +476,7 @@ public class WordSpaceAnalyzer {
if(o.getPos()==PatternConstants.POS_NOUN && MorphUtil.hasVerbOnly(o.getStem())) {
output.removeLast();
return -1;
- }else if(nEnd<input.length() && f[SyllableUtil.IDX_JOSA1]=='1'
+ }else if(nEnd<input.length() && hasJOSA1
&& DictionaryUtil.getNoun(o.getSource())!=null) {
return -1;
}else if(nEnd<input.length() && o.getScore()==AnalysisOutput.SCORE_ANALYSIS
@@ -556,13 +547,13 @@ public class WordSpaceAnalyzer {
* @return if founded
*/
private boolean findNounWithinStr(String str, int ws, int es) {
-
- if(str.length()<es) return false;
+ if (str.length() < es) {
+ return false;
+ }
- for(int i=es;i<str.length();i++) {
- char[] f = SyllableUtil.getFeature(str.charAt(i));
- if(i==str.length() || (f[SyllableUtil.IDX_JOSA1]=='1')) {
- return (DictionaryUtil.getWord(str.substring(ws,i))!=null);
+ for (int i = es; i < str.length(); i++) {
+ if (SyllableUtil.hasFeature(str.charAt(i), SyllableUtil.JOSA1)) {
+ return DictionaryUtil.getWord(str.substring(ws,i)) != null;
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/EomiUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/EomiUtil.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/EomiUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/EomiUtil.java Sat Oct 19 19:54:37 2013
@@ -147,22 +147,18 @@ public class EomiUtil {
}
public static boolean IsNLMBSyl(char ech, char lch) {
-
- char[] features = SyllableUtil.getFeature(ech);
-
switch(lch) {
-
- case 'ã´' :
- return (features[SyllableUtil.IDX_YNPNA]=='1' || features[SyllableUtil.IDX_YNPLN]=='1');
- case 'ã¹' :
- return (features[SyllableUtil.IDX_YNPLA]=='1');
- case 'ã
' :
- return (features[SyllableUtil.IDX_YNPMA]=='1');
- case 'ã
' :
- return (features[SyllableUtil.IDX_YNPBA]=='1');
+ case 'ã´' :
+ return SyllableUtil.hasFeature(ech, SyllableUtil.YNPNA) || SyllableUtil.hasFeature(ech, SyllableUtil.YNPLN);
+ case 'ã¹' :
+ return SyllableUtil.hasFeature(ech, SyllableUtil.YNPLA);
+ case 'ã
' :
+ return SyllableUtil.hasFeature(ech, SyllableUtil.YNPMA);
+ case 'ã
' :
+ return SyllableUtil.hasFeature(ech, SyllableUtil.YNPBA);
+ default:
+ return false;
}
-
- return false;
}
/**
@@ -213,7 +209,7 @@ public class EomiUtil {
}
else if(chrs[0]!='ã
'&&
(chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
'||chrs[1]=='ã
')&&
- (chrs.length==2 || SyllableUtil.getFeature(estem)[SyllableUtil.IDX_YNPAH]=='1')&&
+ (chrs.length==2 || SyllableUtil.hasFeature(estem, SyllableUtil.YNPAH)) &&
(DictionaryUtil.combineAndEomiCheck('ì´', end)!=null))
{
strs[0] = stem;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/MorphUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/MorphUtil.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/MorphUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/MorphUtil.java Sat Oct 19 19:54:37 2013
@@ -136,10 +136,11 @@ public class MorphUtil {
}
public static boolean hasVerbOnly(String input) {
-
- for(int i=input.length()-1;i>=0;i--) {
- char[] feature = SyllableUtil.getFeature(input.charAt(i));
- if(feature[SyllableUtil.IDX_WDSURF]=='1'&&input.length()>i) return true;
+ for (int i = input.length()-1; i >=0; i--) {
+ if (SyllableUtil.hasFeature(input.charAt(i), SyllableUtil.WDSURF)) {
+ assert input.length() > i;
+ return true;
+ }
}
return false;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/NounUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/NounUtil.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/NounUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/NounUtil.java Sat Oct 19 19:54:37 2013
@@ -242,20 +242,15 @@ public class NounUtil {
}
public static boolean endsWith2Josa(String input) {
-
- boolean josaFlag = true;
- for(int i=input.length()-2;i>0;i--) {
-
+ for (int i = input.length()-2; i > 0; i--) {
String josa = input.substring(i);
- char[] feature = SyllableUtil.getFeature(josa.charAt(0));
- if(josaFlag&&DictionaryUtil.existJosa(josa)) return true;
-
-
- if(josaFlag&&feature[SyllableUtil.IDX_JOSA2]=='0') josaFlag = false;
- if(!josaFlag) break;
+ if (DictionaryUtil.existJosa(josa)) {
+ return true;
+ } else if (!SyllableUtil.hasFeature(josa.charAt(0), SyllableUtil.JOSA2)) {
+ return false;
+ }
}
-
return false;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/SyllableUtil.java Sat Oct 19 19:54:37 2013
@@ -18,99 +18,69 @@ package org.apache.lucene.analysis.ko.ut
*/
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
+import java.io.InputStream;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
-import org.apache.lucene.analysis.ko.dic.LineProcessor;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
public class SyllableUtil {
private SyllableUtil() {}
- public static int IDX_JOSA1 = 0; // ì¡°ì¬ì 첫ìì ë¡ ì¬ì©ëë ìì 49ê°
- public static int IDX_JOSA2 = 1; // ì¡°ì¬ì ë ë²ì§¸ ì´ìì ìì ë¡ ì¬ì©ëë ìì 58ê°
- public static int IDX_EOMI1 = 2; // ì´ë¯¸ì 첫ìì ë¡ ì¬ì©ëë ìì 72ê°
- public static int IDX_EOMI2 = 3; // ì´ë¯¸ì ë ë²ì§¸ ì´ìì ìì ë¡ ì¬ì©ëë ìì 105ê°
- public static int IDX_YONG1 = 4; // 1ìì ì©ì¸ì ì¬ì©ëë ìì 362ê°
- public static int IDX_YONG2 = 5; // 2ìì ì©ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 316ê°
- public static int IDX_YONG3 = 6; // 3ìì ì´ì ì©ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 195ê°
- public static int IDX_CHEON1 = 7; // 1ìì ì²´ì¸ì ì¬ì©ëë ìì 680ê°
- public static int IDX_CHEON2 = 8; // 2ìì ì²´ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 916ê°
- public static int IDX_CHEON3 = 9; // 3ìì ì²´ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 800ê°
- public static int IDX_CHEON4 = 10; // 4ìì ì²´ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 610ê°
- public static int IDX_CHEON5 = 11; // 5ìì ì´ì ì²´ì¸ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 330ê°
- public static int IDX_BUSA1 = 12; // 1ìì ë¶ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 191ê°
- public static int IDX_BUSA2 = 13; // 2ìì ë¶ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 519ê°
- public static int IDX_BUSA3 = 14; // 3ìì ë¶ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 139ê°
- public static int IDX_BUSA4 = 15; // 4ìì ë¶ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 366ê°
- public static int IDX_BUSA5 = 16; // 5ìì ë¶ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 79ê°
- public static int IDX_PRONOUN = 17; // ëëª
ì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 77ê°
- public static int IDX_EXCLAM = 18; // ê´íì¬ì ê°íì¬ì ë§ì§ë§ ìì ë¡ ì¬ì©ëë ìì 241ê°
-
- public static int IDX_YNPNA = 19; // (ì©ì¸+'-ã´')ì ìíì¬ ìì±ëë ìì 129ê°
- public static int IDX_YNPLA = 20; // (ì©ì¸+'-ã¹')ì ìí´ ìì±ëë ìì 129ê°
- public static int IDX_YNPMA = 21; // (ì©ì¸+'-ã
')ì ìí´ ìì±ëë ìì 129ê°
- public static int IDX_YNPBA = 22; // (ì©ì¸+'-ã
')ì ìí´ ìì±ëë ìì 129ê°
- public static int IDX_YNPAH = 23; // 모ìì¼ë¡ ëëë ìì 129ê°ì¤ 'ã
/ã
/ã
/ã
/ã
'ë¡ ëëë ê²ì´ ì ì´ë§ ì´ë¯¸ '-ì-'ê³¼ ê²°í©í ë ìì±ëë ìì
- public static int IDX_YNPOU = 24; // 모ì 'ã
/ã
'ë¡ ëëë ìì ì´ 'ì/ì´'ë¡ ììëë ì´ë¯¸ë ì ì´ë§ ì´ë¯¸ '-ì-'ê³¼ ê²°í©í ë ìì±ëë ìì
- public static int IDX_YNPEI = 25; // 모ì 'ã
£'ë¡ ëëë ì©ì¸ì´ 'ì/ì´'ë¡ ììëë ì´ë¯¸ë ì ì´ë§ ì´ë¯¸ '-ì-'ê³¼ ê²°í©í ë ìì±ëë ìì
- public static int IDX_YNPOI = 26; // 모ì 'ã
'ë¡ ëëë ì©ì¸ì´ 'ì/ì´'ë¡ ììëë ì´ë¯¸ë ì ì´ë§ ì´ë¯¸ '-ì-'ê³¼ ê²°í©í ë ìì±ëë ìì
- public static int IDX_YNPLN = 27; // ë°ì¹¨ 'ã¹'ë¡ ëëë ì©ì¸ì´ ì´ë¯¸ '-ã´'ê³¼ ê²°í©í ë ìì±ëë ìì
- public static int IDX_IRRLO = 28; // 'ë¬' ë¶ê·ì¹(8ê°)ì ìíì¬ ìì±ëë ìì : ë¬, ë
- public static int IDX_IRRPLE = 29; // '르' ë¶ê·ì¹(193ê°)ì ìíì¬ ìì±ëë ìì
- public static int IDX_IRROO = 30; // 'ì°' ë¶ê·ì¹ì ìíì¬ ìì±ëë ìì : í¼, í
- public static int IDX_IRROU = 31; // 'ì´' ë¶ê·ì¹ì ìíì¬ ìì±ëë ìì : í´, í
- public static int IDX_IRRDA = 32; // 'ã·' ë¶ê·ì¹(37ê°)ì ìíì¬ ìì±ëë ìì
- public static int IDX_IRRBA = 33; // 'ã
' ë¶ê·ì¹(446ê°)ì ìíì¬ ìì±ëë ìì
- public static int IDX_IRRSA = 34; // 'ã
' ë¶ê·ì¹(39ê°)ì ìíì¬ ìì±ëë ìì
- public static int IDX_IRRHA = 35; // 'ã
' ë¶ê·ì¹(96ê°)ì ìíì¬ ìì±ëë ìì
- public static int IDX_PEND = 36; // ì ì´ë§ ì´ë¯¸ : ì ì
¨ ì ì ì ê²
-
- public static int IDX_YNPEOMI = 37; // ì©ì¸ì´ ì´ë¯¸ì ê²°í©í ë ìì±ëë ìì ì ì 734ê°
-
- /** ì©ì¸ì í층 ííë¡ë§ ì¬ì©ëë ìì */
- public static int IDX_WDSURF = 38;
+ /** Can be the first syllable of a josa (particle); 49 syllables. (English rendering of the original Korean note.) */
+ public static int JOSA1 = 0; // NOTE(review): declared without final, so callers could reassign these indices
+ /** Can be the second or a later syllable of a josa (particle); 58 syllables. */
+ public static int JOSA2 = 1;
+ /** Can be the second or a later syllable of an eomi (ending); 105 syllables. */
+ public static int EOMI2 = 2;
+ /** Produced by (predicate + '-n', jamo U+3134); 129 syllables. */
+ public static int YNPNA = 3;
+ /** Produced by (predicate + '-l', jamo U+3139); 129 syllables. */
+ public static int YNPLA = 4;
+ /** Produced by (predicate + '-m'); 129 syllables. The jamo is garbled in this archive, presumably U+3141 -- TODO confirm. */
+ public static int YNPMA = 5;
+ /** Produced by (predicate + '-b'); 129 syllables. The jamo is garbled in this archive, presumably U+3142 -- TODO confirm. */
+ public static int YNPBA = 6;
+ /**
+  * Of the 129 vowel-final syllables, those ending in one of five listed vowels, as produced when
+  * combining with a pre-final ending (presumably '-eoss-'). The vowel list and the ending are
+  * garbled in this archive -- TODO confirm against the repository.
+  */
+ public static int YNPAH = 7;
+ /** Produced when a predicate ending in the final consonant 'l' (U+3139) combines with the ending '-n' (U+3134). */
+ public static int YNPLN = 8;
+ /** Used only in the surface form of a predicate. */
+ public static int WDSURF = 9;
+ /** Can exist as an ending or as a variant of an ending (i.e. IDX_EOMI is set, or a 1 appears at IDX_YNPNA or later). */
+ public static int EOGAN = 10;
+
+ private static final int NUM_FEATURES = 11; // feature bits stored per syllable (indices 0..10 above)
+ private static final int HANGUL_START = 0xAC00; // first char covered by the feature table
+ private static final int HANGUL_END = 0xD7AF; // last char covered by the feature table (inclusive)
- public static int IDX_EOGAN = 39; // ì´ë¯¸ ëë ì´ë¯¸ì ë³íì¼ë¡ ì¡´ì¬í ì ìë ì (ì¦ IDX_EOMI ì´ê±°ë IDX_YNPNA ì´íì 1ì´ ìë ìì )
-
- private static final List<char[]> syllables; // ìì í¹ì± ì ë³´
+ private static final FixedBitSet features;
static {
- try{
- final List<char[]> list = new ArrayList<char[]>();
- DictionaryResources.readLines(DictionaryResources.FILE_SYLLABLE_FEATURE, new LineProcessor() {
- @Override
- public void processLine(String line) throws IOException {
- list.add(line.toCharArray());
- }
- });
- syllables = Collections.unmodifiableList(list);
- } catch(IOException ioe) {
+ InputStream stream = null;
+ try {
+ stream = DictionaryResources.class.getResourceAsStream(DictionaryResources.FILE_SYLLABLE_DAT);
+ DataInput dat = new InputStreamDataInput(stream);
+ CodecUtil.checkHeader(dat, DictionaryResources.FILE_SYLLABLE_DAT, DictionaryResources.DATA_VERSION, DictionaryResources.DATA_VERSION);
+ long bits[] = new long[dat.readVInt()];
+ for (int i = 0; i < bits.length; i++) {
+ bits[i] = dat.readLong();
+ }
+ features = new FixedBitSet(bits, (1 + HANGUL_END - HANGUL_START) * NUM_FEATURES);
+ } catch (IOException ioe) {
throw new Error("Cannot load ressource", ioe);
- }
- }
-
- /**
- * ì¸ë±ì¤ ê°ì í´ë¹íë ìì ì í¹ì±ì ë°ííë¤.
- * ìì ëë ì«ìì¼ ê²½ì°ë 모ë í´ë¹ì´ ìëë¯ë¡ ê°ì¥ ë§ì§ë§ ê¸ìì¸ 'í£' ì ìì í¹ì±ì ë°ííë¤.
- *
- * @param idx 'ê°'(0xAC00)ì´ 0ë¶í° ì ëì½ëì ìí´ íê¸ìì ì ìì°¨ì ì¼ë¡ ëì´í ê°
- */
- public static char[] getFeature(int idx) {
- if(idx<0||idx>=syllables.size())
- return syllables.get(syllables.size()-1);
- else
- return syllables.get(idx);
+ } finally {
+ IOUtils.closeWhileHandlingException(stream);
+ }
}
- /**
- * ê° ìì ì í¹ì±ì ë°ííë¤.
- * @param syl ìì íë
- */
- public static char[] getFeature(char syl) {
-
- int idx = syl - 0xAC00;
- return getFeature(idx);
-
+ /**
+  * Returns true if the syllable has the specified feature.
+  *
+  * @param syl character to test; anything outside HANGUL_START..HANGUL_END has no features
+  * @param feature feature index (one of the constants 0..NUM_FEATURES-1); it is not range-checked
+  *        here, so a larger value would address bits belonging to a following syllable
+  * @return the stored bit for this syllable/feature pair, or false when syl is outside the table
+  */
+ public static boolean hasFeature(char syl, int feature) {
+ if (syl < HANGUL_START || syl > HANGUL_END) {
+ return false; // outside of hangul syllable range
+ } else {
+ return features.get((syl - HANGUL_START) * NUM_FEATURES + feature); // NUM_FEATURES consecutive bits per syllable, in char order
+ }
+ }
}
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/syllable.dat?rev=1533813&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestSyllableUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestSyllableUtil.java?rev=1533813&view=auto
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestSyllableUtil.java (added)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/test/org/apache/lucene/analysis/ko/utils/TestSyllableUtil.java Sat Oct 19 19:54:37 2013
@@ -0,0 +1,99 @@
+package org.apache.lucene.analysis.ko.utils;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.hasFeature;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.JOSA1;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.JOSA2;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.EOMI2;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPNA;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPLA;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPMA;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPBA;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPAH;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.YNPLN;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.WDSURF;
+import static org.apache.lucene.analysis.ko.utils.SyllableUtil.EOGAN;
+
+public class TestSyllableUtil extends LuceneTestCase {
+
+ public void testGa() {
+ assertTrue(hasFeature('ê°', JOSA1));
+ assertTrue(hasFeature('ê°', JOSA2));
+ assertTrue(hasFeature('ê°', EOMI2));
+ assertFalse(hasFeature('ê°', YNPNA));
+ assertFalse(hasFeature('ê°', YNPLA));
+ assertFalse(hasFeature('ê°', YNPMA));
+ assertFalse(hasFeature('ê°', YNPBA));
+ assertFalse(hasFeature('ê°', YNPAH));
+ assertFalse(hasFeature('ê°', YNPLN));
+ assertFalse(hasFeature('ê°', WDSURF));
+ assertTrue(hasFeature('ê°', EOGAN));
+ }
+
+ public void testGagg() {
+ assertNoFeatures('ê°');
+ }
+
+ public void testGan() {
+ assertFalse(hasFeature('ê°', JOSA1));
+ assertFalse(hasFeature('ê°', JOSA2));
+ assertTrue(hasFeature('ê°', EOMI2));
+ assertTrue(hasFeature('ê°', YNPNA));
+ assertFalse(hasFeature('ê°', YNPLA));
+ assertFalse(hasFeature('ê°', YNPMA));
+ assertFalse(hasFeature('ê°', YNPBA));
+ assertFalse(hasFeature('ê°', YNPAH));
+ assertTrue(hasFeature('ê°', YNPLN));
+ assertFalse(hasFeature('ê°', WDSURF));
+ assertTrue(hasFeature('ê°', EOGAN));
+ }
+
+ public void testSomeFeatures() {
+ assertTrue(hasFeature('ê°', WDSURF));
+ assertTrue(hasFeature('ê°', YNPAH));
+ assertTrue(hasFeature('ê°', YNPBA));
+ assertTrue(hasFeature('ê°', YNPMA));
+ assertTrue(hasFeature('ê°', YNPLA));
+ }
+
+ public void testOutOfBounds() { // every char outside 0xAC00..0xD7AF must report no features at all
+ for (int i = 0; i < 0xAC00; i++) { // all chars below the covered range
+ assertNoFeatures((char)i);
+ }
+ for (int i = 0xD7B0; i <= 0xFFFF; i++) { // all chars above the covered range, up to the largest char value
+ assertNoFeatures((char)i);
+ }
+ }
+
+ private void assertNoFeatures(char ch) { // asserts hasFeature(ch, f) is false for each of the eleven statically imported feature constants
+ assertFalse(hasFeature(ch, JOSA1));
+ assertFalse(hasFeature(ch, JOSA2));
+ assertFalse(hasFeature(ch, EOMI2));
+ assertFalse(hasFeature(ch, YNPNA));
+ assertFalse(hasFeature(ch, YNPLA));
+ assertFalse(hasFeature(ch, YNPMA));
+ assertFalse(hasFeature(ch, YNPBA));
+ assertFalse(hasFeature(ch, YNPAH));
+ assertFalse(hasFeature(ch, YNPLN));
+ assertFalse(hasFeature(ch, WDSURF));
+ assertFalse(hasFeature(ch, EOGAN));
+ }
+}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1533813&r1=1533812&r2=1533813&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Sat Oct 19 19:54:37 2013
@@ -30,6 +30,7 @@ import org.apache.lucene.codecs.CodecUti
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
@@ -44,7 +45,6 @@ public class DictionaryBuilder {
DictionaryResources.FILE_JOSA,
DictionaryResources.FILE_PREFIX,
DictionaryResources.FILE_SUFFIX,
- DictionaryResources.FILE_SYLLABLE_FEATURE,
DictionaryResources.FILE_UNCOMPOUNDS
};
@@ -56,6 +56,7 @@ public class DictionaryBuilder {
copyAsIs(in, out);
}
buildHanjaMap(inputDir, outputDir);
+ buildSyllableDict(inputDir, outputDir);
}
static void copyAsIs(File in, File out) throws Exception {
@@ -120,4 +121,96 @@ public class DictionaryBuilder {
idxStream.close();
datStream.close();
}
+
+ /**
+  * Packs the per-syllable feature flags of syllable.dic into the binary syllable.dat resource.
+  * Reads inputDir/syllable.dic line by line, keeps 11 of the 40 feature columns of every accepted
+  * line, and writes a CodecUtil header, a vInt word count and the raw longs of the bit set to
+  * outputDir/syllable.dat.
+  *
+  * NOTE(review): the input validation uses assert, so it only runs with assertions enabled;
+  * reader and stream are closed only on the success path (there is no try/finally).
+  *
+  * @param inputDir directory containing syllable.dic
+  * @param outputDir directory that receives syllable.dat
+  * @throws Exception whatever the underlying read/write calls throw, propagated unchanged
+  */
+ static void buildSyllableDict(File inputDir, File outputDir) throws Exception {
+ // Syllable features by index (English rendering of the original Korean notes; jamo given as code points):
+ // 0: JOSA1: can be the first syllable of a josa (particle); 49 syllables
+ // 1: JOSA2: can be the second or a later syllable of a josa; 58 syllables
+ // 2: EOMI1: can be the first syllable of an eomi (ending); 72 syllables
+ // 3: EOMI2: can be the second or a later syllable of an eomi; 105 syllables
+ // 4: YONG1: used in a one-syllable predicate; 362 syllables
+ // 5: YONG2: last syllable of a two-syllable predicate; 316 syllables
+ // 6: YONG3: last syllable of a predicate of three or more syllables; 195 syllables
+ // 7: CHEON1: used in a one-syllable substantive; 680 syllables
+ // 8: CHEON2: last syllable of a two-syllable substantive; 916 syllables
+ // 9: CHEON3: last syllable of a three-syllable substantive; 800 syllables
+ // 10: CHEON4: last syllable of a four-syllable substantive; 610 syllables
+ // 11: CHEON5: last syllable of a substantive of five or more syllables; 330 syllables
+ // 12: BUSA1: last syllable of a one-syllable adverb; 191 syllables
+ // 13: BUSA2: last syllable of a two-syllable adverb; 519 syllables
+ // 14: BUSA3: last syllable of a three-syllable adverb; 139 syllables
+ // 15: BUSA4: last syllable of a four-syllable adverb; 366 syllables
+ // 16: BUSA5: last syllable of a five-syllable adverb; 79 syllables
+ // 17: PRONOUN: last syllable of a pronoun; 77 syllables
+ // 18: EXCLAM: last syllable of a determiner or interjection; 241 syllables
+ // 19: YNPNA: produced by (predicate + '-n', U+3134); 129 syllables
+ // 20: YNPLA: produced by (predicate + '-l', U+3139); 129 syllables
+ // 21: YNPMA: produced by (predicate + '-m', presumably U+3141 -- garbled in this archive); 129 syllables
+ // 22: YNPBA: produced by (predicate + '-b', presumably U+3142 -- garbled in this archive); 129 syllables
+ // 23: YNPAH: of the 129 vowel-final syllables, those ending in one of five listed vowels (garbled here), produced when combining with a pre-final ending (presumably '-eoss-')
+ // 24: YNPOU: syllable ending in one of two vowels (garbled here; presumably 'o'/'u') combining with an ending that starts with 'a/eo' or with the same pre-final ending
+ // 25: YNPEI: predicate ending in the vowel 'i' (U+3163) combining with an ending that starts with 'a/eo' or with the same pre-final ending
+ // 26: YNPOI: predicate ending in a vowel (garbled here; presumably 'oe') combining with an ending that starts with 'a/eo' or with the same pre-final ending
+ // 27: YNPLN: predicate ending in the final consonant 'l' (U+3139) combining with the ending '-n' (U+3134)
+ // 28: IRRLO: produced by the 'reo' irregular conjugation (8 entries)
+ // 29: IRRPLE: produced by the 'reu' irregular conjugation (193 entries)
+ // 30: IRROO: produced by the 'u' irregular conjugation
+ // 31: IRROU: produced by the 'eo' irregular conjugation
+ // 32: IRRDA: produced by the 'd' (U+3137) irregular conjugation (37 entries)
+ // 33: IRRBA: produced by the 'b' irregular conjugation (446 entries; jamo garbled here)
+ // 34: IRRSA: produced by the 's' irregular conjugation (39 entries; jamo garbled here)
+ // 35: IRRHA: produced by the 'h' irregular conjugation (96 entries; jamo garbled here)
+ // 36: PEND: pre-final endings (six example syllables follow in the original; garbled here)
+ // 37: YNPEOMI: number of syllables produced when a predicate combines with an ending: 734
+ // 38: WD_SURF: used only in the surface form of a predicate
+ // 39: EOGAN: can exist as an ending or as a variant of an ending (i.e. IDX_EOMI is set, or a 1 appears at IDX_YNPNA or later)
+
+ OutputStream stream = new BufferedOutputStream(new FileOutputStream(new File(outputDir, DictionaryResources.FILE_SYLLABLE_DAT)));
+ DataOutput out = new OutputStreamDataOutput(stream);
+ CodecUtil.writeHeader(out, DictionaryResources.FILE_SYLLABLE_DAT, DictionaryResources.DATA_VERSION);
+
+ int numBits = (1 + 0xD7AF - 0xAC00) * 11; // 11 kept feature bits for every char in AC00..D7AF
+ FixedBitSet features = new FixedBitSet(numBits);
+ int idx = 0; // next bit position to fill; advances by 11 per accepted line
+
+ // (AC00-D7AF)
+ File input = new File(inputDir, "syllable.dic");
+ BufferedReader reader = new BufferedReader(IOUtils.getDecodingReader(input, IOUtils.CHARSET_UTF_8));
+ String line = null;
+ int last = 0xABFF; // char value of the previously accepted line; starts one below AC00
+ while ((line = reader.readLine()) != null) {
+ if (!line.startsWith("!") && !line.startsWith("\uFEFF")) { // skip lines that start with '!' or a byte-order mark
+ // validate (using the comments!)
+ final int ch;
+ String currentChar = line.substring(43); // the text from offset 43 names the syllable: a single char, otherwise hex digits
+ if (currentChar.length() == 1) {
+ ch = currentChar.charAt(0);
+ } else {
+ ch = Integer.parseInt(currentChar, 16);
+ }
+ assert ch == last + 1; // accepted lines must name consecutive chars
+ last = ch;
+ // set feature bits
+ if (line.charAt(0) == '1') features.set(idx); idx++; // column 0 = JOSA1; idx++ is a separate statement and always runs
+ if (line.charAt(1) == '1') features.set(idx); idx++; // column 1 = JOSA2
+ if (line.charAt(3) == '1') features.set(idx); idx++; // column 3 = EOMI2
+ if (line.charAt(19) == '1') features.set(idx); idx++; // column 19 = YNPNA
+ if (line.charAt(20) == '1') features.set(idx); idx++; // column 20 = YNPLA
+ if (line.charAt(21) == '1') features.set(idx); idx++; // column 21 = YNPMA
+ if (line.charAt(22) == '1') features.set(idx); idx++; // column 22 = YNPBA
+ if (line.charAt(23) == '1') features.set(idx); idx++; // column 23 = YNPAH
+ if (line.charAt(27) == '1') features.set(idx); idx++; // column 27 = YNPLN
+ if (line.charAt(38) == '1') features.set(idx); idx++; // column 38 = WD_SURF
+ if (line.charAt(39) == '1') features.set(idx); idx++; // column 39 = EOGAN
+ }
+ }
+ assert idx == numBits; // the number of accepted lines must equal the size of the AC00..D7AF range
+ long raw[] = features.getBits();
+ out.writeVInt(raw.length); // word count first, then the words themselves
+ for (int i = 0; i < raw.length; i++) {
+ out.writeLong(raw[i]);
+ }
+ reader.close();
+ stream.close();
+ }
}