You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 14:31:34 UTC
svn commit: r1534128 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/morph/
java/org/apache/lucene/analysis/ko/utils/ resou...
Author: rmuir
Date: Mon Oct 21 12:31:34 2013
New Revision: 1534128
URL: http://svn.apache.org/r1534128
Log:
LUCENE-4956: remove trie
Removed:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/dictionary.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Mon Oct 21 12:31:34 2013
@@ -312,9 +312,9 @@ public final class KoreanFilter extends
sb.append(chs[k]);
if(k>0) candiList.add(sb);
- Iterator<String[]> iter = DictionaryUtil.findWithPrefix(sb.toString());
- if(!iter.hasNext()) // 사전에 없으면 삭제 후보
- removeList.add(sb);
+ if (!DictionaryUtil.hasWordPrefix(sb)) {
+ removeList.add(sb); // 사전에 없으면 삭제 후보
+ }
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Mon Oct 21 12:31:34 2013
@@ -30,20 +30,14 @@ import org.apache.lucene.util.IOUtils;
*/
public class DictionaryResources {
- public static final String FILE_DICTIONARY = "dictionary.dic";
-
public static final String FILE_JOSA = "josa.dic";
public static final String FILE_EOMI = "eomi.dic";
- public static final String FILE_EXTENSION = "extension.dic";
-
public static final String FILE_PREFIX = "prefix.dic";
public static final String FILE_SUFFIX = "suffix.dic";
- public static final String FILE_COMPOUNDS = "compounds.dic";
-
public static final String FILE_UNCOMPOUNDS = "uncompounds.dic";
public static final String FILE_SYLLABLE_DAT = "syllable.dat";
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 12:31:34 2013
@@ -20,13 +20,9 @@ package org.apache.lucene.analysis.ko.di
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
import java.util.Set;
-import org.apache.lucene.analysis.ko.utils.Trie;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
@@ -35,9 +31,7 @@ import org.apache.lucene.util.fst.FST;
public class DictionaryUtil {
private DictionaryUtil() {}
- private static final Trie<String,WordEntry> dictionary = new Trie<String, WordEntry>(false);
-
- private static final HangulDictionary newDictionary;
+ private static final HangulDictionary dictionary;
private static final Set<String> josas = new HashSet<String>();
@@ -51,41 +45,6 @@ public class DictionaryUtil {
static {
try {
- final LineProcessor proc = new LineProcessor() {
- @Override
- public void processLine(String line) throws IOException {
- String[] infos = line.split("[,]+");
- if (infos.length != 2) {
- throw new IOException("Invalid file format: " + line);
- }
- if (infos[1].length() != 10) {
- throw new IOException("Invalid file format: " + line);
- }
-
- WordEntry entry = new WordEntry(infos[0].trim(), parseFlags(infos[1]), null);
- dictionary.add(entry.getWord(), entry);
- }
- };
- DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY, proc);
- DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION, proc);
-
- DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
- @Override
- public void processLine(String compound) throws IOException {
- String[] infos = compound.split("[:]+");
- if (infos.length != 3) {
- throw new IOException("Invalid file format: " + compound);
- }
- if (infos[2].length() != 4) {
- throw new IOException("Illegal file format: " + compound);
- }
-
- final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
- final WordEntry entry = new WordEntry(infos[0].trim(), parseFlags("200"+infos[2]+"00X"), c);
- dictionary.add(entry.getWord(), entry);
- }
- });
-
DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
@Override
public void processLine(String compound) throws IOException {
@@ -112,49 +71,48 @@ public class DictionaryUtil {
dat.readBytes(metadata, 0, metadata.length);
ByteOutputs outputs = ByteOutputs.getSingleton();
FST<Byte> fst = new FST<Byte>(dat, outputs);
- newDictionary = new HangulDictionary(fst, metadata);
+ dictionary = new HangulDictionary(fst, metadata);
stream.close();
} catch (IOException e) {
throw new Error("Cannot load resource",e);
}
}
-
- @SuppressWarnings({"rawtypes","unchecked"})
- public static Iterator<String[]> findWithPrefix(String prefix) {
- return dictionary.getPrefixedBy(prefix);
+
+ public static boolean hasWordPrefix(CharSequence prefix) {
+ return dictionary.hasPrefix(prefix);
}
/** only use this if you surely need the whole entry */
public static WordEntry getWord(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
} else {
- return newDictionary.decodeEntry(key, b);
+ return dictionary.decodeEntry(key, b);
}
}
public static WordEntry getWordExceptVerb(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
}
- char flags = newDictionary.getFlags(b);
+ char flags = dictionary.getFlags(b);
if ((flags & (WordEntry.NOUN | WordEntry.BUSA)) != 0) {
- return newDictionary.decodeEntry(key, b, flags);
+ return dictionary.decodeEntry(key, b, flags);
} else {
return null;
}
}
public static WordEntry getNoun(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
}
- char flags = newDictionary.getFlags(b);
+ char flags = dictionary.getFlags(b);
if ((flags & WordEntry.NOUN) != 0 && (flags & WordEntry.COMPOUND) == 0) {
- return newDictionary.decodeEntry(key, b, flags);
+ return dictionary.decodeEntry(key, b, flags);
} else {
return null;
}
@@ -167,39 +125,39 @@ public class DictionaryUtil {
* @return WordEntry
*/
public static WordEntry getAllNoun(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
}
- char flags = newDictionary.getFlags(b);
+ char flags = dictionary.getFlags(b);
if ((flags & WordEntry.NOUN) != 0) {
- return newDictionary.decodeEntry(key, b, flags);
+ return dictionary.decodeEntry(key, b, flags);
} else {
return null;
}
}
public static WordEntry getVerb(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
}
- char flags = newDictionary.getFlags(b);
+ char flags = dictionary.getFlags(b);
if ((flags & WordEntry.VERB) != 0) {
- return newDictionary.decodeEntry(key, b, flags);
+ return dictionary.decodeEntry(key, b, flags);
} else {
return null;
}
}
public static WordEntry getBusa(String key) {
- Byte b = newDictionary.lookup(key);
+ Byte b = dictionary.lookup(key);
if (b == null) {
return null;
}
- char flags = newDictionary.getFlags(b);
+ char flags = dictionary.getFlags(b);
if ((flags & WordEntry.BUSA) != 0 && (flags & WordEntry.NOUN) == 0) {
- return newDictionary.decodeEntry(key, b, flags);
+ return dictionary.decodeEntry(key, b, flags);
} else {
return null;
}
@@ -253,70 +211,4 @@ public class DictionaryUtil {
}
});
}
-
- private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {
- List<CompoundEntry> list = new ArrayList<CompoundEntry>();
- for(String str: arr) {
- list.add(new CompoundEntry(str, true));
- }
- return list;
- }
-
- // TODO: move all this to build time
- private static int parseFlags(String buffer) {
- if (buffer.length() != 10) {
- throw new IllegalArgumentException("Invalid flags: " + buffer);
- }
- int flags = 0;
- // IDX_NOUN: 1 if noun, 2 if compound
- if (buffer.charAt(0) == '2') {
- flags |= WordEntry.COMPOUND | WordEntry.NOUN;
- } else if (buffer.charAt(0) == '1') {
- flags |= WordEntry.NOUN;
- } else if (buffer.charAt(0) != '0') {
- throw new IllegalArgumentException("Invalid flags: " + buffer);
- }
- // IDX_VERB
- if (parseBoolean(buffer, 1)) {
- flags |= WordEntry.VERB;
- }
- // IDX_BUSA
- if (parseBoolean(buffer, 2)) {
- flags |= WordEntry.BUSA;
- }
- // IDX_DOV
- if (parseBoolean(buffer, 3)) {
- flags |= WordEntry.DOV;
- }
- // IDX_BEV
- if (parseBoolean(buffer, 4)) {
- flags |= WordEntry.BEV;
- }
- // IDX_NE
- if (parseBoolean(buffer, 5)) {
- flags |= WordEntry.NE;
- }
- // IDX_REGURA
- switch(buffer.charAt(9)) {
- case 'B': return flags | WordEntry.VERB_TYPE_BIUP;
- case 'H': return flags | WordEntry.VERB_TYPE_HIOOT;
- case 'U': return flags | WordEntry.VERB_TYPE_LIUL;
- case 'L': return flags | WordEntry.VERB_TYPE_LOO;
- case 'S': return flags | WordEntry.VERB_TYPE_SIUT;
- case 'D': return flags | WordEntry.VERB_TYPE_DI;
- case 'R': return flags | WordEntry.VERB_TYPE_RU;
- case 'X': return flags | WordEntry.VERB_TYPE_REGULAR;
- default: throw new IllegalArgumentException("Invalid flags: " + buffer);
- }
- }
-
- private static boolean parseBoolean(String buffer, int position) {
- if (buffer.charAt(position) == '1') {
- return true;
- } else if (buffer.charAt(position) == '0') {
- return false;
- } else {
- throw new IllegalArgumentException("Invalid flags: " + buffer);
- }
- }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 12:31:34 2013
@@ -118,4 +118,21 @@ class HangulDictionary {
compounds.add(new CompoundEntry(sb.toString(), true));
return compounds;
}
+
+ boolean hasPrefix(CharSequence key) {
+ final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
+
+ final BytesReader fstReader = fst.getBytesReader();
+
+ for (int i = 0; i < key.length(); i++) {
+ try {
+ if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+ return false;
+ }
+ } catch (IOException bogus) {
+ throw new RuntimeException();
+ }
+ }
+ return true;
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Mon Oct 21 12:31:34 2013
@@ -53,7 +53,7 @@ public class WordSpaceAnalyzer {
char ch = input.charAt(i);
String prefix = i==input.length()-1 ? "X" : input.substring(wStart,i+2);
- Iterator<String[]> iter = DictionaryUtil.findWithPrefix(prefix);
+ boolean prefixExists = DictionaryUtil.hasWordPrefix(prefix);
List<AnalysisOutput> candidates = new ArrayList<AnalysisOutput>();
@@ -65,10 +65,10 @@ public class WordSpaceAnalyzer {
// 다음 음절이 2음절 이상 단어에 포함되어 있고 마지막 음절이 아니라면 띄워쓰기 위치가 아닐 가능성이 크다.
// 부사, 관형사, 감탄사 등 단일어일 가능성인 경우 띄워쓰기가 가능하나,
// 이 경우는 다음 음절을 조사하여
- } else if(i!= input.length()-1 && iter.hasNext()) {
+ } else if(i!= input.length()-1 && prefixExists) {
// 아무짓도 하지 않음.
sgCount = i;
- } else if(!iter.hasNext() &&
+ } else if(!prefixExists &&
(entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) {
candidates.add(buildSingleOutput(entry));
@@ -299,13 +299,15 @@ public class WordSpaceAnalyzer {
// 동사앞에 명사분리
int vstart = 0;
for(int i=estart-1;i>=0;i--) {
- Iterator<String[]> iter = DictionaryUtil.findWithPrefix(snipt.substring(i,estart));
- if(iter.hasNext()) vstart=i;
- else break;
+ if (DictionaryUtil.hasWordPrefix(snipt.substring(i, estart))) {
+ vstart = i;
+ } else {
+ break;
+ }
}
if(snipt.length()>eend &&
- DictionaryUtil.findWithPrefix(snipt.substring(vstart,eend+1)).hasNext())
+ DictionaryUtil.hasWordPrefix(snipt.substring(vstart,eend+1)))
return candidates; // 다음음절까지 단어의 일부라면.. 분해를 안한다.
String pvword = null;
@@ -475,7 +477,7 @@ public class WordSpaceAnalyzer {
&& DictionaryUtil.getNoun(o.getSource())!=null) {
return -1;
}else if(nEnd<input.length() && o.getScore()==AnalysisOutput.SCORE_ANALYSIS
- && DictionaryUtil.findWithPrefix(ejend+input.charAt(nEnd)).hasNext()) { // 루ì¬íã´ ê¸ííìë¶ì기 ë°©ì§
+ && DictionaryUtil.hasWordPrefix(ejend+input.charAt(nEnd))) { // 루ì¬íã´ ê¸ííìë¶ì기 ë°©ì§
return -1;
}else if(po!=null&&po.getPatn()==PatternConstants.PTN_VM&&"ã
".equals(po.getEomi())&&
o.getStem().equals("í")) { // ë¤ì§ í©ëë¤ ë¡ ë¶ë¦¬ëë ê² ë°©ì§
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 12:31:34 2013
@@ -49,10 +49,7 @@ import org.apache.lucene.util.packed.Pac
public class DictionaryBuilder {
public static void main(String args[]) throws Exception {
String FILES_AS_IS[] = {
- DictionaryResources.FILE_COMPOUNDS,
- DictionaryResources.FILE_DICTIONARY,
DictionaryResources.FILE_EOMI,
- DictionaryResources.FILE_EXTENSION,
DictionaryResources.FILE_JOSA,
DictionaryResources.FILE_PREFIX,
DictionaryResources.FILE_SUFFIX,