You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 15:06:09 UTC
svn commit: r1534141 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/dic/
resources/org/apache/lucene/analysis/ko/dic/
tools/java/org/apache/lucene/analysis/ko/dic/
Author: rmuir
Date: Mon Oct 21 13:06:08 2013
New Revision: 1534141
URL: http://svn.apache.org/r1534141
Log:
LUCENE-4956: add some cleanups, remove packing, add missing close, lazy-load compound data until you ask for it
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 13:06:08 2013
@@ -22,6 +22,7 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import org.apache.lucene.codecs.CodecUtil;
@@ -82,91 +83,105 @@ public class DictionaryUtil {
}
}
+ /** true if something with this prefix exists */
public static boolean hasWordPrefix(CharSequence prefix) {
return dictionary.hasPrefix(prefix);
}
/** only use this if you surely need the whole entry */
public static WordEntry getWord(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
} else {
- return dictionary.decodeEntry(key, b);
+ return new WordEntry(key, dictionary.getFlags(clazz), clazz);
}
}
+ /** Looks up noun, compound noun, or adverb */
public static WordEntry getWordExceptVerb(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
}
- char flags = dictionary.getFlags(b);
+ char flags = dictionary.getFlags(clazz);
if ((flags & (WordEntry.NOUN | WordEntry.BUSA)) != 0) {
- return dictionary.decodeEntry(key, b, flags);
+ return new WordEntry(key, flags, clazz);
} else {
return null;
}
}
+ /** Looks up a noun (but not compound noun) */
public static WordEntry getNoun(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
}
- char flags = dictionary.getFlags(b);
+ char flags = dictionary.getFlags(clazz);
if ((flags & WordEntry.NOUN) != 0 && (flags & WordEntry.COMPOUND) == 0) {
- return dictionary.decodeEntry(key, b, flags);
+ return new WordEntry(key, flags, clazz);
} else {
return null;
}
}
/**
- *
* return all noun including compound noun
- * @param key the lookup key text
- * @return WordEntry
*/
public static WordEntry getAllNoun(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
}
- char flags = dictionary.getFlags(b);
+ char flags = dictionary.getFlags(clazz);
if ((flags & WordEntry.NOUN) != 0) {
- return dictionary.decodeEntry(key, b, flags);
+ return new WordEntry(key, flags, clazz);
} else {
return null;
}
}
+ /**
+ * returns any verb
+ */
public static WordEntry getVerb(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
}
- char flags = dictionary.getFlags(b);
+ char flags = dictionary.getFlags(clazz);
if ((flags & WordEntry.VERB) != 0) {
- return dictionary.decodeEntry(key, b, flags);
+ return new WordEntry(key, flags, clazz);
} else {
return null;
}
}
+ /** Looks up an adverb-only */
public static WordEntry getBusa(String key) {
- Byte b = dictionary.lookup(key);
- if (b == null) {
+ Byte clazz = dictionary.lookup(key);
+ if (clazz == null) {
return null;
}
- char flags = dictionary.getFlags(b);
+ char flags = dictionary.getFlags(clazz);
if ((flags & WordEntry.BUSA) != 0 && (flags & WordEntry.NOUN) == 0) {
- return dictionary.decodeEntry(key, b, flags);
+ return new WordEntry(key, flags, clazz);
} else {
return null;
}
}
+ /** return list of irregular compounds for word class. */
+ static List<CompoundEntry> getIrregularCompounds(byte clazz) {
+ return dictionary.getIrregularCompounds(clazz);
+ }
+
+ /** return list of compounds for key and word class. */
+ static List<CompoundEntry> getCompounds(String key, byte clazz) {
+ return dictionary.getCompounds(key, clazz);
+ }
+
// TODO: make this more efficient later
public static boolean isUncompound(String before, String after) {
return uncompounds.contains(before + "," + after);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 13:06:08 2013
@@ -35,6 +35,7 @@ class HangulDictionary {
this.metadata = metadata;
}
+ /** looks up word class for a word (exact match) */
Byte lookup(String key) {
// TODO: why does this thing look up empty strings?
if (key.length() == 0) {
@@ -65,27 +66,16 @@ class HangulDictionary {
}
}
- char getFlags(byte b) {
- int off = b * RECORD_SIZE;
+ /** looks up features for word class */
+ char getFlags(byte clazz) {
+ int off = clazz * RECORD_SIZE;
return (char)((metadata[off] << 8) | (metadata[off+1] & 0xff));
}
- WordEntry decodeEntry(String key, byte b) {
- return decodeEntry(key, b, getFlags(b));
- }
-
- WordEntry decodeEntry(String key, byte b, char flags) {
- if ((flags & WordEntry.COMPOUND_IRREGULAR) != 0) {
- return new WordEntry(key, flags, getIrregularCompounds(key, b));
- } else if ((flags & WordEntry.COMPOUND) != 0) {
- return new WordEntry(key, flags, getCompounds(key, b));
- } else {
- return new WordEntry(key, flags, null);
- }
- }
-
- List<CompoundEntry> getCompounds(String word, byte b) {
- int off = b * RECORD_SIZE;
+ /** return list of compounds for key and word class.
+ * this retrieves the splits for the class and applies them to the key */
+ List<CompoundEntry> getCompounds(String word, byte clazz) {
+ int off = clazz * RECORD_SIZE;
int numSplits = metadata[off+2];
assert numSplits > 0;
List<CompoundEntry> compounds = new ArrayList<>(numSplits+1);
@@ -99,8 +89,10 @@ class HangulDictionary {
return compounds;
}
- List<CompoundEntry> getIrregularCompounds(String word, byte b) {
- int off = b * RECORD_SIZE;
+ /** return list of irregular compounds for word class.
+ * this retrieves the decompounded data for this irregular class */
+ List<CompoundEntry> getIrregularCompounds(byte clazz) {
+ int off = clazz * RECORD_SIZE;
int numChars = metadata[off+2];
// TODO: more efficient
List<CompoundEntry> compounds = new ArrayList<>();
@@ -119,6 +111,7 @@ class HangulDictionary {
return compounds;
}
+ /** walks the fst for prefix and returns true if it hits no dead end */
boolean hasPrefix(CharSequence key) {
final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 13:06:08 2013
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ko.di
* limitations under the License.
*/
-import java.util.Collections;
import java.util.List;
public class WordEntry {
@@ -63,22 +62,19 @@ public class WordEntry {
/**
* 단어특성 (word features)
*/
- final char features;
+ private final char features;
- final List<CompoundEntry> compounds;
+ private final byte clazz;
- public WordEntry(String word, int features, List<CompoundEntry> compounds) {
+ WordEntry(String word, char features, byte clazz) {
if (features < 0 || features >= 2048) {
throw new IllegalArgumentException("Invalid features: " + Integer.toHexString(features));
}
this.word = word;
this.features = (char) features;
- this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+ this.clazz = clazz;
// make sure compound nouns are also nouns
assert !isCompoundNoun() || isNoun();
- // has compound list iff compound feature is set
- assert (isCompoundNoun() && compounds.size() > 1)
- || (!isCompoundNoun() && compounds == null) : "inconsistent compound data for word: " + word;
}
public String getWord() {
@@ -98,7 +94,12 @@ public class WordEntry {
/** Returns List of compounds for word */
public List<CompoundEntry> getCompounds() {
assert isCompoundNoun();
- return compounds;
+ // TODO: should we cache this here? see if someone is calling this repeatedly? i hope not.
+ if ((features & COMPOUND_IRREGULAR) != 0) {
+ return DictionaryUtil.getIrregularCompounds(clazz);
+ } else {
+ return DictionaryUtil.getCompounds(word, clazz);
+ }
}
/** Returns true if entry is verb */
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 13:06:08 2013
@@ -224,6 +224,11 @@ public class DictionaryBuilder {
}
+ /**
+ * makes FST (currently byte2 syllables) mapping to "word class"
+ * each word has features + compound data, but many of them share the
+ * same set of features, and have simple compound splits in the same place.
+ */
static void buildHangulDict(File inputDir, File outputDir) throws Exception {
TreeMap<String,Integer> sorted = new TreeMap<String,Integer>();
Map<Output,Integer> classes = new LinkedHashMap<>();
@@ -255,9 +260,8 @@ public class DictionaryBuilder {
System.out.println("#words: " + sorted.size());
System.out.println("#classes: " + classes.size());
Outputs<Byte> fstOutput = ByteOutputs.getSingleton();
- // makes corrupt FST!!!!
- // Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
- Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
+ // why does packed=false give a smaller fst?!?!
+ Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
IntsRef scratch = new IntsRef();
for (Map.Entry<String,Integer> e : sorted.entrySet()) {
String token = e.getKey();
@@ -280,6 +284,7 @@ public class DictionaryBuilder {
o.write(out);
}
fst.save(out);
+ stream.close();
}
static void processLine(String line, TreeMap<String,Integer> sorted, Map<Output,Integer> classes) {