You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 15:06:09 UTC

svn commit: r1534141 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/dic/ resources/org/apache/lucene/analysis/ko/dic/ tools/java/org/apache/lucene/analysis/ko/dic/

Author: rmuir
Date: Mon Oct 21 13:06:08 2013
New Revision: 1534141

URL: http://svn.apache.org/r1534141
Log:
LUCENE-4956: add some cleanups, remove packing, add missing close, defer loading compound data until you ask for it

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 13:06:08 2013
@@ -22,6 +22,7 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.codecs.CodecUtil;
@@ -82,91 +83,105 @@ public class DictionaryUtil {
     }
   }
   
+  /** true if something with this prefix exists */
   public static boolean hasWordPrefix(CharSequence prefix) {
     return dictionary.hasPrefix(prefix);
   }
 
   /** only use this if you surely need the whole entry */
   public static WordEntry getWord(String key) {    
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     } else {
-      return dictionary.decodeEntry(key, b);
+      return new WordEntry(key, dictionary.getFlags(clazz), clazz);
     }
   }
   
+  /** Looks up noun, compound noun, or adverb */
   public static WordEntry getWordExceptVerb(String key) {
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     }
-    char flags = dictionary.getFlags(b);
+    char flags = dictionary.getFlags(clazz);
     if ((flags & (WordEntry.NOUN | WordEntry.BUSA)) != 0) {
-      return dictionary.decodeEntry(key, b, flags);
+      return new WordEntry(key, flags, clazz);
     } else {
       return null;
     }
   }
   
+  /** Looks up a noun (but not compound noun) */
   public static WordEntry getNoun(String key) {
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     }
-    char flags = dictionary.getFlags(b);
+    char flags = dictionary.getFlags(clazz);
     if ((flags & WordEntry.NOUN) != 0 && (flags & WordEntry.COMPOUND) == 0) {
-      return dictionary.decodeEntry(key, b, flags);
+      return new WordEntry(key, flags, clazz);
     } else {
       return null;
     }
   }
   
   /**
-   * 
    * return all noun including compound noun
-   * @param key the lookup key text
-   * @return  WordEntry
    */
   public static WordEntry getAllNoun(String key) {  
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     }
-    char flags = dictionary.getFlags(b);
+    char flags = dictionary.getFlags(clazz);
     if ((flags & WordEntry.NOUN) != 0) {
-      return dictionary.decodeEntry(key, b, flags);
+      return new WordEntry(key, flags, clazz);
     } else {
       return null;
     }
   }
   
+  /**
+   * returns any verb
+   */
   public static WordEntry getVerb(String key) {
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     }
-    char flags = dictionary.getFlags(b);
+    char flags = dictionary.getFlags(clazz);
     if ((flags & WordEntry.VERB) != 0) {
-      return dictionary.decodeEntry(key, b, flags);
+      return new WordEntry(key, flags, clazz);
     } else {
       return null;
     }
   }
   
+  /** Looks up an adverb (but not one that is also a noun) */
   public static WordEntry getBusa(String key) {
-    Byte b = dictionary.lookup(key);
-    if (b == null) {
+    Byte clazz = dictionary.lookup(key);
+    if (clazz == null) {
       return null;
     }
-    char flags = dictionary.getFlags(b);
+    char flags = dictionary.getFlags(clazz);
     if ((flags & WordEntry.BUSA) != 0 && (flags & WordEntry.NOUN) == 0) {
-      return dictionary.decodeEntry(key, b, flags);
+      return new WordEntry(key, flags, clazz);
     } else {
       return null;
     }
   }
   
+  /** return list of irregular compounds for word class. */
+  static List<CompoundEntry> getIrregularCompounds(byte clazz) {
+    return dictionary.getIrregularCompounds(clazz);
+  }
+  
+  /** return list of compounds for key and word class. */
+  static List<CompoundEntry> getCompounds(String key, byte clazz) {
+    return dictionary.getCompounds(key, clazz);
+  }
+  
   // TODO: make this more efficient later
   public static boolean isUncompound(String before, String after) {
     return uncompounds.contains(before + "," + after);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 13:06:08 2013
@@ -35,6 +35,7 @@ class HangulDictionary {
     this.metadata = metadata;
   }
   
+  /** looks up word class for a word (exact match) */
   Byte lookup(String key) {
     // TODO: why is does this thing lookup empty strings?
     if (key.length() == 0) {
@@ -65,27 +66,16 @@ class HangulDictionary {
     }
   }
   
-  char getFlags(byte b) {
-    int off = b * RECORD_SIZE;
+  /** looks up features for word class */
+  char getFlags(byte clazz) {
+    int off = clazz * RECORD_SIZE;
     return (char)((metadata[off] << 8) | (metadata[off+1] & 0xff));
   }
   
-  WordEntry decodeEntry(String key, byte b) {
-    return decodeEntry(key, b, getFlags(b));
-  }
-  
-  WordEntry decodeEntry(String key, byte b, char flags) {
-    if ((flags & WordEntry.COMPOUND_IRREGULAR) != 0) {
-      return new WordEntry(key, flags, getIrregularCompounds(key, b));
-    } else if ((flags & WordEntry.COMPOUND) != 0) {
-      return new WordEntry(key, flags, getCompounds(key, b));
-    } else {
-      return new WordEntry(key, flags, null);
-    }
-  }
-  
-  List<CompoundEntry> getCompounds(String word, byte b) {
-    int off = b * RECORD_SIZE;
+  /** return list of compounds for key and word class.
+   * this retrieves the splits for the class and applies them to the key */
+  List<CompoundEntry> getCompounds(String word, byte clazz) {
+    int off = clazz * RECORD_SIZE;
     int numSplits = metadata[off+2];
     assert numSplits > 0;
     List<CompoundEntry> compounds = new ArrayList<>(numSplits+1);
@@ -99,8 +89,10 @@ class HangulDictionary {
     return compounds;
   }
   
-  List<CompoundEntry> getIrregularCompounds(String word, byte b) {
-    int off = b * RECORD_SIZE;
+  /** return list of compounds for an irregular word class.
+   * this retrieves the decompounded data for this irregular class (no key is needed) */
+  List<CompoundEntry> getIrregularCompounds(byte clazz) {
+    int off = clazz * RECORD_SIZE;
     int numChars = metadata[off+2];
     // TODO: more efficient
     List<CompoundEntry> compounds = new ArrayList<>();
@@ -119,6 +111,7 @@ class HangulDictionary {
     return compounds;
   }
   
+  /** walks the fst for prefix and returns true if it hits no dead end */
   boolean hasPrefix(CharSequence key) {
     final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 13:06:08 2013
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.ko.di
  * limitations under the License.
  */
 
-import java.util.Collections;
 import java.util.List;
 
 public class WordEntry {
@@ -63,22 +62,19 @@ public class WordEntry {
   /**
    * 단어특성
    */
-  final char features;
+  private final char features;
   
-  final List<CompoundEntry> compounds;
+  private final byte clazz;
   
-  public WordEntry(String word, int features, List<CompoundEntry> compounds) {
+  WordEntry(String word, char features, byte clazz) {
     if (features < 0 || features >= 2048) {
       throw new IllegalArgumentException("Invalid features: " + Integer.toHexString(features));
     }
     this.word = word;
     this.features = (char) features;
-    this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+    this.clazz = clazz;
     // make sure compound nouns are also nouns
     assert !isCompoundNoun() || isNoun();
-    // has compound list iff compound feature is set
-    assert (isCompoundNoun() && compounds.size() > 1) 
-        || (!isCompoundNoun() && compounds == null) : "inconsistent compound data for word: " + word;
   }
   
   public String getWord() {
@@ -98,7 +94,12 @@ public class WordEntry {
   /** Returns List of compounds for word */
   public List<CompoundEntry> getCompounds() {
     assert isCompoundNoun();
-    return compounds;
+    // TODO: should we cache this here? see if someone is calling this repeatedly? i hope not.
+    if ((features & COMPOUND_IRREGULAR) != 0) {
+      return DictionaryUtil.getIrregularCompounds(clazz);
+    } else {
+      return DictionaryUtil.getCompounds(word, clazz);
+    }
   }
   
   /** Returns true if entry is verb */

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/words.dat?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534141&r1=1534140&r2=1534141&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 13:06:08 2013
@@ -224,6 +224,11 @@ public class DictionaryBuilder {
   }
   
   
+  /** 
+   * makes FST (currently byte2 syllables) mapping to "word class"
+   * each word has features + compound data, but many of them share the
+   * same set of features, and have simple compound splits in the same place.
+   */
   static void buildHangulDict(File inputDir, File outputDir) throws Exception {
     TreeMap<String,Integer> sorted = new TreeMap<String,Integer>();
     Map<Output,Integer> classes = new LinkedHashMap<>();
@@ -255,9 +260,8 @@ public class DictionaryBuilder {
     System.out.println("#words: " + sorted.size());
     System.out.println("#classes: " + classes.size());
     Outputs<Byte> fstOutput = ByteOutputs.getSingleton();
-    //    makes corrupt FST!!!!
-    //     Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
-    Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
+    // why does packed=false give a smaller fst?!?!
+    Builder<Byte> builder = new Builder<Byte>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, false, PackedInts.DEFAULT, true, 15);
     IntsRef scratch = new IntsRef();
     for (Map.Entry<String,Integer> e : sorted.entrySet()) {
       String token = e.getKey();
@@ -280,6 +284,7 @@ public class DictionaryBuilder {
       o.write(out);
     }
     fst.save(out);
+    stream.close();
   }
   
   static void processLine(String line, TreeMap<String,Integer> sorted, Map<Output,Integer> classes) {