You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 14:31:34 UTC

svn commit: r1534128 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/dic/ java/org/apache/lucene/analysis/ko/morph/ java/org/apache/lucene/analysis/ko/utils/ resou...

Author: rmuir
Date: Mon Oct 21 12:31:34 2013
New Revision: 1534128

URL: http://svn.apache.org/r1534128
Log:
LUCENE-4956: remove trie

Removed:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/utils/
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/dictionary.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic
Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Mon Oct 21 12:31:34 2013
@@ -312,9 +312,9 @@ public final class KoreanFilter extends 
           sb.append(chs[k]);          
           if(k>0)  candiList.add(sb);
           
-          Iterator<String[]> iter = DictionaryUtil.findWithPrefix(sb.toString());
-          if(!iter.hasNext()) // 사전에 없으면 삭제 후보
-            removeList.add(sb);    
+          if (!DictionaryUtil.hasWordPrefix(sb)) {
+            removeList.add(sb); // 사전에 없으면 삭제 후보
+          }  
         }        
       }            
 

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Mon Oct 21 12:31:34 2013
@@ -30,20 +30,14 @@ import org.apache.lucene.util.IOUtils;
  */
 public class DictionaryResources {
   
-  public static final String FILE_DICTIONARY = "dictionary.dic";  
-  
   public static final String FILE_JOSA = "josa.dic";
   
   public static final String FILE_EOMI = "eomi.dic";
   
-  public static final String FILE_EXTENSION = "extension.dic";
-  
   public static final String FILE_PREFIX = "prefix.dic";
   
   public static final String FILE_SUFFIX = "suffix.dic";  
   
-  public static final String FILE_COMPOUNDS = "compounds.dic";  
-  
   public static final String FILE_UNCOMPOUNDS = "uncompounds.dic";
   
   public static final String FILE_SYLLABLE_DAT = "syllable.dat";

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 12:31:34 2013
@@ -20,13 +20,9 @@ package org.apache.lucene.analysis.ko.di
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
 import java.util.Set;
 
-import org.apache.lucene.analysis.ko.utils.Trie;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.InputStreamDataInput;
@@ -35,9 +31,7 @@ import org.apache.lucene.util.fst.FST;
 public class DictionaryUtil {
   private DictionaryUtil() {}
   
-  private static final Trie<String,WordEntry> dictionary = new Trie<String, WordEntry>(false);
-  
-  private static final HangulDictionary newDictionary;
+  private static final HangulDictionary dictionary;
   
   private static final Set<String> josas = new HashSet<String>();
   
@@ -51,41 +45,6 @@ public class DictionaryUtil {
   
   static {  
     try {
-      final LineProcessor proc = new LineProcessor() {
-        @Override
-        public void processLine(String line) throws IOException {
-          String[] infos = line.split("[,]+");
-          if (infos.length != 2) {
-            throw new IOException("Invalid file format: " + line);
-          }
-          if (infos[1].length() != 10) {
-            throw new IOException("Invalid file format: " + line);
-          }
-          
-          WordEntry entry = new WordEntry(infos[0].trim(), parseFlags(infos[1]), null);
-          dictionary.add(entry.getWord(), entry);          
-        }
-      };
-      DictionaryResources.readLines(DictionaryResources.FILE_DICTIONARY, proc);
-      DictionaryResources.readLines(DictionaryResources.FILE_EXTENSION, proc);
-      
-      DictionaryResources.readLines(DictionaryResources.FILE_COMPOUNDS, new LineProcessor() {
-        @Override
-        public void processLine(String compound) throws IOException {
-          String[] infos = compound.split("[:]+");
-          if (infos.length != 3) {
-            throw new IOException("Invalid file format: " + compound);
-          }
-          if (infos[2].length() != 4) {
-            throw new IOException("Illegal file format: " + compound);
-          }
-          
-          final List<CompoundEntry> c = compoundArrayToList(infos[1], infos[1].split("[,]+"));
-          final WordEntry entry = new WordEntry(infos[0].trim(), parseFlags("200"+infos[2]+"00X"), c);
-          dictionary.add(entry.getWord(), entry);          
-        }       
-      }); 
-      
       DictionaryResources.readLines(DictionaryResources.FILE_UNCOMPOUNDS, new LineProcessor() {
         @Override
         public void processLine(String compound) throws IOException {
@@ -112,49 +71,48 @@ public class DictionaryUtil {
       dat.readBytes(metadata, 0, metadata.length);
       ByteOutputs outputs = ByteOutputs.getSingleton();
       FST<Byte> fst = new FST<Byte>(dat, outputs);
-      newDictionary = new HangulDictionary(fst, metadata);
+      dictionary = new HangulDictionary(fst, metadata);
       stream.close();
     } catch (IOException e) {
       throw new Error("Cannot load resource",e);
     }
   }
-
-  @SuppressWarnings({"rawtypes","unchecked"})
-  public static Iterator<String[]> findWithPrefix(String prefix) {
-    return dictionary.getPrefixedBy(prefix);
+  
+  public static boolean hasWordPrefix(CharSequence prefix) {
+    return dictionary.hasPrefix(prefix);
   }
 
   /** only use this if you surely need the whole entry */
   public static WordEntry getWord(String key) {    
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     } else {
-      return newDictionary.decodeEntry(key, b);
+      return dictionary.decodeEntry(key, b);
     }
   }
   
   public static WordEntry getWordExceptVerb(String key) {
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     }
-    char flags = newDictionary.getFlags(b);
+    char flags = dictionary.getFlags(b);
     if ((flags & (WordEntry.NOUN | WordEntry.BUSA)) != 0) {
-      return newDictionary.decodeEntry(key, b, flags);
+      return dictionary.decodeEntry(key, b, flags);
     } else {
       return null;
     }
   }
   
   public static WordEntry getNoun(String key) {
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     }
-    char flags = newDictionary.getFlags(b);
+    char flags = dictionary.getFlags(b);
     if ((flags & WordEntry.NOUN) != 0 && (flags & WordEntry.COMPOUND) == 0) {
-      return newDictionary.decodeEntry(key, b, flags);
+      return dictionary.decodeEntry(key, b, flags);
     } else {
       return null;
     }
@@ -167,39 +125,39 @@ public class DictionaryUtil {
    * @return  WordEntry
    */
   public static WordEntry getAllNoun(String key) {  
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     }
-    char flags = newDictionary.getFlags(b);
+    char flags = dictionary.getFlags(b);
     if ((flags & WordEntry.NOUN) != 0) {
-      return newDictionary.decodeEntry(key, b, flags);
+      return dictionary.decodeEntry(key, b, flags);
     } else {
       return null;
     }
   }
   
   public static WordEntry getVerb(String key) {
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     }
-    char flags = newDictionary.getFlags(b);
+    char flags = dictionary.getFlags(b);
     if ((flags & WordEntry.VERB) != 0) {
-      return newDictionary.decodeEntry(key, b, flags);
+      return dictionary.decodeEntry(key, b, flags);
     } else {
       return null;
     }
   }
   
   public static WordEntry getBusa(String key) {
-    Byte b = newDictionary.lookup(key);
+    Byte b = dictionary.lookup(key);
     if (b == null) {
       return null;
     }
-    char flags = newDictionary.getFlags(b);
+    char flags = dictionary.getFlags(b);
     if ((flags & WordEntry.BUSA) != 0 && (flags & WordEntry.NOUN) == 0) {
-      return newDictionary.decodeEntry(key, b, flags);
+      return dictionary.decodeEntry(key, b, flags);
     } else {
       return null;
     }
@@ -253,70 +211,4 @@ public class DictionaryUtil {
       }
     });
   }
-  
-  private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {
-    List<CompoundEntry> list = new ArrayList<CompoundEntry>();
-    for(String str: arr) {
-      list.add(new CompoundEntry(str, true));
-    }
-    return list;
-  }
-  
-  // TODO: move all this to build time
-  private static int parseFlags(String buffer) {
-    if (buffer.length() != 10) {
-      throw new IllegalArgumentException("Invalid flags: " + buffer);
-    }
-    int flags = 0;
-    // IDX_NOUN: 1 if noun, 2 if compound
-    if (buffer.charAt(0) == '2') {
-      flags |= WordEntry.COMPOUND | WordEntry.NOUN;
-    } else if (buffer.charAt(0) == '1') {
-      flags |= WordEntry.NOUN;
-    } else if (buffer.charAt(0) != '0') {
-      throw new IllegalArgumentException("Invalid flags: " + buffer);
-    }
-    // IDX_VERB
-    if (parseBoolean(buffer, 1)) {
-      flags |= WordEntry.VERB;
-    }
-    // IDX_BUSA
-    if (parseBoolean(buffer, 2)) {
-      flags |= WordEntry.BUSA;
-    }
-    // IDX_DOV
-    if (parseBoolean(buffer, 3)) {
-      flags |= WordEntry.DOV;
-    }
-    // IDX_BEV
-    if (parseBoolean(buffer, 4)) {
-      flags |= WordEntry.BEV;
-    }
-    // IDX_NE
-    if (parseBoolean(buffer, 5)) {
-      flags |= WordEntry.NE;
-    }
-    // IDX_REGURA
-    switch(buffer.charAt(9)) {
-      case 'B': return flags | WordEntry.VERB_TYPE_BIUP;
-      case 'H': return flags | WordEntry.VERB_TYPE_HIOOT;
-      case 'U': return flags | WordEntry.VERB_TYPE_LIUL;
-      case 'L': return flags | WordEntry.VERB_TYPE_LOO;
-      case 'S': return flags | WordEntry.VERB_TYPE_SIUT;
-      case 'D': return flags | WordEntry.VERB_TYPE_DI;
-      case 'R': return flags | WordEntry.VERB_TYPE_RU;
-      case 'X': return flags | WordEntry.VERB_TYPE_REGULAR;
-      default: throw new IllegalArgumentException("Invalid flags: " + buffer);
-    }
-  }
-  
-  private static boolean parseBoolean(String buffer, int position) {
-    if (buffer.charAt(position) == '1') {
-      return true;
-    } else if (buffer.charAt(position) == '0') {
-      return false;
-    } else {
-      throw new IllegalArgumentException("Invalid flags: " + buffer);
-    }
-  }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/HangulDictionary.java Mon Oct 21 12:31:34 2013
@@ -118,4 +118,21 @@ class HangulDictionary {
     compounds.add(new CompoundEntry(sb.toString(), true));
     return compounds;
   }
+  
+  /** Returns true if each char of {@code key} follows an FST arc from the root (an empty key yields true). */
+  boolean hasPrefix(CharSequence key) {
+    final FST.Arc<Byte> arc = fst.getFirstArc(new FST.Arc<Byte>());
+    final BytesReader fstReader = fst.getBytesReader();
+    for (int i = 0; i < key.length(); i++) {
+      try {
+        if (fst.findTargetArc(key.charAt(i), arc, arc, fstReader) == null) {
+          return false; // no outgoing arc for this char, so nothing stored starts with key
+        }
+      } catch (IOException bogus) {
+        // Preserve the cause so an unexpected read failure remains diagnosable.
+        throw new RuntimeException(bogus);
+      }
+    }
+    return true;
+  }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Mon Oct 21 12:31:34 2013
@@ -53,7 +53,7 @@ public class WordSpaceAnalyzer {
       char ch = input.charAt(i);
       
       String prefix = i==input.length()-1 ? "X" : input.substring(wStart,i+2);          
-      Iterator<String[]> iter = DictionaryUtil.findWithPrefix(prefix);
+      boolean prefixExists = DictionaryUtil.hasWordPrefix(prefix);
       
       List<AnalysisOutput> candidates = new ArrayList<AnalysisOutput>();    
       
@@ -65,10 +65,10 @@ public class WordSpaceAnalyzer {
       // 다음 음절이 2음절 이상 단어에 포함되어 있고 마지막 음절이 아니라면   띄워쓰기 위치가 아닐 가능성이 크다.
       // 부사, 관형사, 감탄사 등 단일어일 가능성인 경우 띄워쓰기가 가능하나, 
       // 이 경우는 다음 음절을 조사하여 
-      } else if(i!= input.length()-1 && iter.hasNext()) { 
+      } else if(i!= input.length()-1 && prefixExists) { 
         // 아무짓도 하지 않음.
         sgCount = i;
-      } else if(!iter.hasNext() && 
+      } else if(!prefixExists && 
           (entry=DictionaryUtil.getBusa(input.substring(wStart,i+1)))!=null) {        
         candidates.add(buildSingleOutput(entry));
         
@@ -299,13 +299,15 @@ public class WordSpaceAnalyzer {
     // 동사앞에 명사분리
     int vstart = 0;
     for(int i=estart-1;i>=0;i--) {  
-      Iterator<String[]> iter = DictionaryUtil.findWithPrefix(snipt.substring(i,estart)); 
-      if(iter.hasNext()) vstart=i;
-      else break;
+      if (DictionaryUtil.hasWordPrefix(snipt.substring(i, estart))) {
+        vstart = i;
+      } else {
+        break;
+      }
     }
       
     if(snipt.length()>eend &&
-        DictionaryUtil.findWithPrefix(snipt.substring(vstart,eend+1)).hasNext()) 
+        DictionaryUtil.hasWordPrefix(snipt.substring(vstart,eend+1))) 
       return candidates;  // 다음음절까지 단어의 일부라면.. 분해를 안한다.
     
     String pvword = null;
@@ -475,7 +477,7 @@ public class WordSpaceAnalyzer {
       && DictionaryUtil.getNoun(o.getSource())!=null) {
       return -1;
     }else if(nEnd<input.length() && o.getScore()==AnalysisOutput.SCORE_ANALYSIS 
-      && DictionaryUtil.findWithPrefix(ejend+input.charAt(nEnd)).hasNext()) { // 루씬하ㄴ 글형태소분석기 방지
+      && DictionaryUtil.hasWordPrefix(ejend+input.charAt(nEnd))) { // 루씬하ㄴ 글형태소분석기 방지
       return -1;  
     }else if(po!=null&&po.getPatn()==PatternConstants.PTN_VM&&"ㅁ".equals(po.getEomi())&&
         o.getStem().equals("하")) { // 다짐 합니다 로 분리되는 것 방지

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java?rev=1534128&r1=1534127&r2=1534128&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/tools/java/org/apache/lucene/analysis/ko/dic/DictionaryBuilder.java Mon Oct 21 12:31:34 2013
@@ -49,10 +49,7 @@ import org.apache.lucene.util.packed.Pac
 public class DictionaryBuilder {
   public static void main(String args[]) throws Exception {
     String FILES_AS_IS[] = { 
-      DictionaryResources.FILE_COMPOUNDS,
-      DictionaryResources.FILE_DICTIONARY,
       DictionaryResources.FILE_EOMI,
-      DictionaryResources.FILE_EXTENSION,
       DictionaryResources.FILE_JOSA,
       DictionaryResources.FILE_PREFIX,
       DictionaryResources.FILE_SUFFIX,