You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 08:56:30 UTC

svn commit: r1534040 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic: DictionaryUtil.java WordEntry.java

Author: rmuir
Date: Mon Oct 21 06:56:30 2013
New Revision: 1534040

URL: http://svn.apache.org/r1534040
Log:
LUCENE-4956: don't hold thousands of arrays in dictionary

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534040&r1=1534039&r2=1534040&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 06:56:30 2013
@@ -19,11 +19,9 @@ package org.apache.lucene.analysis.ko.di
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 
 import org.apache.lucene.analysis.ko.utils.Trie;
@@ -56,7 +54,7 @@ public class DictionaryUtil {
             throw new IOException("Invalid file format: " + line);
           }
           
-          WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray(), null);
+          WordEntry entry = new WordEntry(infos[0].trim(), parseFlags(infos[1]), null);
           dictionary.add(entry.getWord(), entry);          
         }
       };
@@ -75,7 +73,7 @@ public class DictionaryUtil {
           }
           
           final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
-          final WordEntry entry = new WordEntry(infos[0].trim(),("200"+infos[2]+"00X").toCharArray(), c);
+          final WordEntry entry = new WordEntry(infos[0].trim(), parseFlags("200"+infos[2]+"00X"), c);
           dictionary.add(entry.getWord(), entry);          
         }       
       }); 
@@ -222,4 +220,62 @@ public class DictionaryUtil {
     }
     return list;
   }
+  
+  // TODO: move all this to build time
+  private static int parseFlags(String buffer) {
+    if (buffer.length() != 10) {
+      throw new IllegalArgumentException("Invalid flags: " + buffer);
+    }
+    int flags = 0;
+    // IDX_NOUN: 1 if noun, 2 if compound
+    if (buffer.charAt(0) == '2') {
+      flags |= WordEntry.COMPOUND | WordEntry.NOUN;
+    } else if (buffer.charAt(0) == '1') {
+      flags |= WordEntry.NOUN;
+    } else if (buffer.charAt(0) != '0') {
+      throw new IllegalArgumentException("Invalid flags: " + buffer);
+    }
+    // IDX_VERB
+    if (parseBoolean(buffer, 1)) {
+      flags |= WordEntry.VERB;
+    }
+    // IDX_BUSA
+    if (parseBoolean(buffer, 2)) {
+      flags |= WordEntry.BUSA;
+    }
+    // IDX_DOV
+    if (parseBoolean(buffer, 3)) {
+      flags |= WordEntry.DOV;
+    }
+    // IDX_BEV
+    if (parseBoolean(buffer, 4)) {
+      flags |= WordEntry.BEV;
+    }
+    // IDX_NE
+    if (parseBoolean(buffer, 5)) {
+      flags |= WordEntry.NE;
+    }
+    // IDX_REGURA
+    switch(buffer.charAt(9)) {
+      case 'B': return flags | WordEntry.VERB_TYPE_BIUP;
+      case 'H': return flags | WordEntry.VERB_TYPE_HIOOT;
+      case 'U': return flags | WordEntry.VERB_TYPE_LIUL;
+      case 'L': return flags | WordEntry.VERB_TYPE_LOO;
+      case 'S': return flags | WordEntry.VERB_TYPE_SIUT;
+      case 'D': return flags | WordEntry.VERB_TYPE_DI;
+      case 'R': return flags | WordEntry.VERB_TYPE_RU;
+      case 'X': return flags | WordEntry.VERB_TYPE_REGULAR;
+      default: throw new IllegalArgumentException("Invalid flags: " + buffer);
+    }
+  }
+  
+  private static boolean parseBoolean(String buffer, int position) {
+    if (buffer.charAt(position) == '1') {
+      return true;
+    } else if (buffer.charAt(position) == '0') {
+      return false;
+    } else {
+      throw new IllegalArgumentException("Invalid flags: " + buffer);
+    }
+  }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java?rev=1534040&r1=1534039&r2=1534040&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/WordEntry.java Mon Oct 21 06:56:30 2013
@@ -22,37 +22,37 @@ import java.util.List;
 
 public class WordEntry {
 
-  private static final int IDX_NOUN = 0;
-  private static final int IDX_VERB = 1;
-  private static final int IDX_BUSA = 2;
-  private static final int IDX_DOV = 3;
-  private static final int IDX_BEV = 4;
-  private static final int IDX_NE = 5;
-  private static final int IDX_REGURA = 9;
+  static final int NOUN =     1 << 3;
+  static final int VERB =     1 << 4;
+  static final int BUSA =     1 << 5;
+  static final int DOV =      1 << 6;
+  static final int BEV =      1 << 7;
+  static final int NE  =      1 << 8;
+  static final int COMPOUND = 1 << 9;
+  
+  /** Regular verb type */
+  public static final int VERB_TYPE_REGULAR = 0;
   
   /** Irregular verb type (ㅂ-final) */
-  public static final int VERB_TYPE_BIUP = 'B';
+  public static final int VERB_TYPE_BIUP = 1;
   
   /** Irregular verb type (ㅎ-final) */
-  public static final int VERB_TYPE_HIOOT = 'H';
+  public static final int VERB_TYPE_HIOOT = 2;
   
   /** Irregular verb type (ㄹ-final) */
-  public static final int VERB_TYPE_LIUL = 'U';
+  public static final int VERB_TYPE_LIUL = 3;
   
   /** Irregular verb type (르-final) */
-  public static final int VERB_TYPE_LOO = 'L';
+  public static final int VERB_TYPE_LOO = 4;
 
   /** Irregular verb type (ㅅ-final) */
-  public static final int VERB_TYPE_SIUT = 'S';
+  public static final int VERB_TYPE_SIUT = 5;
   
   /** Irregular verb type (ㄷ-final) */
-  public static final int VERB_TYPE_DI = 'D';
+  public static final int VERB_TYPE_DI = 6;
   
   /** Irregular verb type (러-final) */
-  public static final int VERB_TYPE_RU = 'R';
-  
-  /** Regular verb type */
-  public static final int VERB_TYPE_REGULAR = 'X';
+  public static final int VERB_TYPE_RU = 7;
   
   /**
    * 단어
@@ -62,20 +62,22 @@ public class WordEntry {
   /**
    * 단어특성
    */
-  private final char[] features;
+  private final char features;
   
   private final List<CompoundEntry> compounds;
   
-  public WordEntry(String word, char[] cs, List<CompoundEntry> compounds) {
-    if (cs.length != 10) {
-      throw new IllegalArgumentException("invalid features for word: " + word + ", got:" + new String(cs));
-    } 
+  public WordEntry(String word, int features, List<CompoundEntry> compounds) {
+    if (features < 0 || features >= 1024) {
+      throw new IllegalArgumentException("Invalid features: " + Integer.toHexString(features));
+    }
     this.word = word;
-    this.features = cs;
+    this.features = (char) features;
     this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+    // make sure compound nouns are also nouns
+    assert !isCompoundNoun() || isNoun();
     // has compound list iff compound feature is set
-    assert (features[IDX_NOUN] == '2' && compounds != null && compounds.size() > 1) 
-        || (features[IDX_NOUN] != '2' && compounds == null) : "inconsistent compound data for word: " + word;
+    assert (isCompoundNoun() && compounds.size() > 1) 
+        || (!isCompoundNoun() && compounds == null) : "inconsistent compound data for word: " + word;
   }
   
   public String getWord() {
@@ -84,12 +86,12 @@ public class WordEntry {
   
   /** Returns true if the entry is a noun (or compound noun) */
   public boolean isNoun() {
-    return features[IDX_NOUN] != '0';
+    return (features & NOUN) != 0;
   }
   
   /** Returns true if entry is a compound noun */
   public boolean isCompoundNoun() {
-    return features[IDX_NOUN] == '2';
+    return (features & COMPOUND) != 0;
   }
   
   /** Returns List of compounds for word */
@@ -100,31 +102,31 @@ public class WordEntry {
   
   /** Returns true if entry is verb */
   public boolean isVerb() {
-    return features[IDX_VERB] == '1';
+    return (features & VERB) != 0;
   }
   
-  /** Returns verb type (IRR_TYPE_REGULAR or irregular type) */
+  /** Returns verb type (VERB_TYPE_REGULAR or irregular ending type) */
   public int getVerbType() {
-    return features[IDX_REGURA];
+    return features & 0x7;
   }
   
   /** Returns true if entry is busa (adverb) */
   public boolean isAdverb() {
-    return features[IDX_BUSA] == '1';
+    return (features & BUSA) != 0;
   }
   
   /** allows noun analysis with -하 verb suffix */
   public boolean hasDOV() {
-    return features[IDX_DOV] == '1';
+    return (features & DOV) != 0;
   }
   
   /** allows noun analysis with -되 verb suffix */
   public boolean hasBEV() {
-    return features[IDX_BEV] == '1';
+    return (features & BEV) != 0;
   }
   
   /** allows noun analysis with -내 verb suffix */
   public boolean hasNE() {
-    return features[IDX_NE] == '1';
+    return (features & NE) != 0;
   }
 }