You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 08:56:30 UTC
svn commit: r1534040 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic:
DictionaryUtil.java WordEntry.java
Author: rmuir
Date: Mon Oct 21 06:56:30 2013
New Revision: 1534040
URL: http://svn.apache.org/r1534040
Log:
LUCENE-4956: don't hold thousands of arrays in dictionary
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534040&r1=1534039&r2=1534040&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 06:56:30 2013
@@ -19,11 +19,9 @@ package org.apache.lucene.analysis.ko.di
import java.io.IOException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.ko.utils.Trie;
@@ -56,7 +54,7 @@ public class DictionaryUtil {
throw new IOException("Invalid file format: " + line);
}
- WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray(), null);
+ WordEntry entry = new WordEntry(infos[0].trim(), parseFlags(infos[1]), null);
dictionary.add(entry.getWord(), entry);
}
};
@@ -75,7 +73,7 @@ public class DictionaryUtil {
}
final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
- final WordEntry entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"00X").toCharArray(), c);
+ final WordEntry entry = new WordEntry(infos[0].trim(), parseFlags("200"+infos[2]+"00X"), c);
dictionary.add(entry.getWord(), entry);
}
});
@@ -222,4 +220,62 @@ public class DictionaryUtil {
}
return list;
}
+
+ // TODO: move all this to build time
+ private static int parseFlags(String buffer) {
+ if (buffer.length() != 10) {
+ throw new IllegalArgumentException("Invalid flags: " + buffer);
+ }
+ int flags = 0;
+ // IDX_NOUN: 1 if noun, 2 if compound
+ if (buffer.charAt(0) == '2') {
+ flags |= WordEntry.COMPOUND | WordEntry.NOUN;
+ } else if (buffer.charAt(0) == '1') {
+ flags |= WordEntry.NOUN;
+ } else if (buffer.charAt(0) != '0') {
+ throw new IllegalArgumentException("Invalid flags: " + buffer);
+ }
+ // IDX_VERB
+ if (parseBoolean(buffer, 1)) {
+ flags |= WordEntry.VERB;
+ }
+ // IDX_BUSA
+ if (parseBoolean(buffer, 2)) {
+ flags |= WordEntry.BUSA;
+ }
+ // IDX_DOV
+ if (parseBoolean(buffer, 3)) {
+ flags |= WordEntry.DOV;
+ }
+ // IDX_BEV
+ if (parseBoolean(buffer, 4)) {
+ flags |= WordEntry.BEV;
+ }
+ // IDX_NE
+ if (parseBoolean(buffer, 5)) {
+ flags |= WordEntry.NE;
+ }
+ // IDX_REGURA
+ switch(buffer.charAt(9)) {
+ case 'B': return flags | WordEntry.VERB_TYPE_BIUP;
+ case 'H': return flags | WordEntry.VERB_TYPE_HIOOT;
+ case 'U': return flags | WordEntry.VERB_TYPE_LIUL;
+ case 'L': return flags | WordEntry.VERB_TYPE_LOO;
+ case 'S': return flags | WordEntry.VERB_TYPE_SIUT;
+ case 'D': return flags | WordEntry.VERB_TYPE_DI;
+ case 'R': return flags | WordEntry.VERB_TYPE_RU;
+ case 'X': return flags | WordEntry.VERB_TYPE_REGULAR;
+ default: throw new IllegalArgumentException("Invalid flags: " + buffer);
+ }
+ }
+
+ private static boolean parseBoolean(String buffer, int position) {
+ if (buffer.charAt(position) == '1') {
+ return true;
+ } else if (buffer.charAt(position) == '0') {
+ return false;
+ } else {
+ throw new IllegalArgumentException("Invalid flags: " + buffer);
+ }
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534040&r1=1534039&r2=1534040&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 06:56:30 2013
@@ -22,37 +22,37 @@ import java.util.List;
public class WordEntry {
- private static final int IDX_NOUN = 0;
- private static final int IDX_VERB = 1;
- private static final int IDX_BUSA = 2;
- private static final int IDX_DOV = 3;
- private static final int IDX_BEV = 4;
- private static final int IDX_NE = 5;
- private static final int IDX_REGURA = 9;
+ static final int NOUN = 1 << 3;
+ static final int VERB = 1 << 4;
+ static final int BUSA = 1 << 5;
+ static final int DOV = 1 << 6;
+ static final int BEV = 1 << 7;
+ static final int NE = 1 << 8;
+ static final int COMPOUND = 1 << 9;
+
+ /** Regular verb type */
+ public static final int VERB_TYPE_REGULAR = 0;
/** Irregular verb type (ㅂ-final) */
- public static final int VERB_TYPE_BIUP = 'B';
+ public static final int VERB_TYPE_BIUP = 1;
/** Irregular verb type (ㅎ-final) */
- public static final int VERB_TYPE_HIOOT = 'H';
+ public static final int VERB_TYPE_HIOOT = 2;
/** Irregular verb type (ㄹ-final) */
- public static final int VERB_TYPE_LIUL = 'U';
+ public static final int VERB_TYPE_LIUL = 3;
/** Irregular verb type (르-final) */
- public static final int VERB_TYPE_LOO = 'L';
+ public static final int VERB_TYPE_LOO = 4;
/** Irregular verb type (ㅅ-final) */
- public static final int VERB_TYPE_SIUT = 'S';
+ public static final int VERB_TYPE_SIUT = 5;
/** Irregular verb type (ㄷ-final) */
- public static final int VERB_TYPE_DI = 'D';
+ public static final int VERB_TYPE_DI = 6;
/** Irregular verb type (러-final) */
- public static final int VERB_TYPE_RU = 'R';
-
- /** Regular verb type */
- public static final int VERB_TYPE_REGULAR = 'X';
+ public static final int VERB_TYPE_RU = 7;
/**
* 단어
@@ -62,20 +62,22 @@ public class WordEntry {
/**
* 단어특성
*/
- private final char[] features;
+ private final char features;
private final List<CompoundEntry> compounds;
- public WordEntry(String word, char[] cs, List<CompoundEntry> compounds) {
- if (cs.length != 10) {
- throw new IllegalArgumentException("invalid features for word: " + word + ", got:" + new String(cs));
- }
+ public WordEntry(String word, int features, List<CompoundEntry> compounds) {
+ if (features < 0 || features >= 1024) {
+ throw new IllegalArgumentException("Invalid features: " + Integer.toHexString(features));
+ }
this.word = word;
- this.features = cs;
+ this.features = (char) features;
this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+ // make sure compound nouns are also nouns
+ assert !isCompoundNoun() || isNoun();
// has compound list iff compound feature is set
- assert (features[IDX_NOUN] == '2' && compounds != null && compounds.size() > 1)
- || (features[IDX_NOUN] != '2' && compounds == null) : "inconsistent compound data for word: " + word;
+ assert (isCompoundNoun() && compounds.size() > 1)
+ || (!isCompoundNoun() && compounds == null) : "inconsistent compound data for word: " + word;
}
public String getWord() {
@@ -84,12 +86,12 @@ public class WordEntry {
/** Returns true if the entry is a noun (or compound noun) */
public boolean isNoun() {
- return features[IDX_NOUN] != '0';
+ return (features & NOUN) != 0;
}
/** Returns true if entry is a compound noun */
public boolean isCompoundNoun() {
- return features[IDX_NOUN] == '2';
+ return (features & COMPOUND) != 0;
}
/** Returns List of compounds for word */
@@ -100,31 +102,31 @@ public class WordEntry {
/** Returns true if entry is verb */
public boolean isVerb() {
- return features[IDX_VERB] == '1';
+ return (features & VERB) != 0;
}
- /** Returns verb type (IRR_TYPE_REGULAR or irregular type) */
+ /** Returns verb type (VERB_TYPE_REGULAR or irregular ending type) */
public int getVerbType() {
- return features[IDX_REGURA];
+ return features & 0x7;
}
/** Returns true if entry is busa (adverb) */
public boolean isAdverb() {
- return features[IDX_BUSA] == '1';
+ return (features & BUSA) != 0;
}
/** allows noun analysis with -하 verb suffix */
public boolean hasDOV() {
- return features[IDX_DOV] == '1';
+ return (features & DOV) != 0;
}
/** allows noun analysis with -되 verb suffix */
public boolean hasBEV() {
- return features[IDX_BEV] == '1';
+ return (features & BEV) != 0;
}
/** allows noun analysis with -내 verb suffix */
public boolean hasNE() {
- return features[IDX_NE] == '1';
+ return (features & NE) != 0;
}
}