You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 06:22:54 UTC

svn commit: r1534021 - in /lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: data/ java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/dic/ java/org/apache/lucene/analysis/ko/morph/ resources/org/apache/lucene/analysis/ko/dic/

Author: rmuir
Date: Mon Oct 21 04:22:53 2013
New Revision: 1534021

URL: http://svn.apache.org/r1534021
Log:
LUCENE-4956: clean up compound / feature processing a bit (more coming)

Modified:
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
    lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic Mon Oct 21 04:22:53 2013
@@ -70,10 +70,6 @@
 결의문:결의,문:0000
 결의안:결의,안:0000
 결혼식:결혼,식:1000
-경영인:경영인:0000
-경영자:경영자:0000
-경영주:경영주:0000
-경영진:경영진:0000
 고교연맹전:고교,연맹전:0000
 골프장:골프,장:0000
 곰비늘고사리:곰비늘,고사리:0000
@@ -99,26 +95,21 @@
 고투자율:고,투자,율:0000
 경전철:경,전철:0000
 갓김치:갓,김치:0000
-갖저고리:갖저고리:0000
 밤하늘:밤,하늘:0000
 개개인:개,개인:0000
 개고기:개,고기:0000
 개고사리:개,고사리:0000
-개관적:개관적:0000
 개구리때:개구리,때:0000
 개구리밥:개구리,밥:0000
 개그맨:개그,맨:0000
 개기월식:개기,월식:0000
 개기일식:개기,일식:0000
 개념화:개념,화:1100
-개떡수제비:개떡수제비:0000
 개똥벌레:개똥,벌레:0000
 개똥지빠귀:개똥,지빠귀:0000
 개막식:개막,식:0000
-개망신:개망신:1000
 개머루덩굴:개머루,덩굴:0000
 개미굴:개미,굴:0000
-개발도상국:개발도상국:0000
 개발자:개발,자:0000
 개밥도둑:개,밥,도둑:0000
 개별화:개별,화:0000
@@ -128,7 +119,6 @@
 개선문:개선,문:0000
 개선안:개선,안:0000
 개성적:개성,적:0000
-개연성:개연성:0000
 개울가:개울,가:0000
 개울물:개울,물:0000
 개정령:개정,령:0000
@@ -143,7 +133,6 @@
 객관성:객관,성:0000
 객관적:객관,적:0000
 객관화:객관,화:1100
-객지살이:객지살이:1000
 갯마을:갯,마을:0000
 갯바위:갯,바위:0000
 갯버들:갯,버들:0000
@@ -158,7 +147,6 @@
 거족적:거족,적:0000
 거주지:거주,지:0000
 거지반:거지,반:0000
-거짓말:거짓말:1000
 거짓말쟁이:거짓,말,쟁이:0000
 거짓부렁거짓부렁:거짓,부렁:0000
 걱정거리:걱정,거리:0000
@@ -188,7 +176,6 @@
 결벽증:결벽,증:0000
 결사적:결사,적:0000
 겹간통:겹,간통:1000
-겹사돈:겹사돈:0000
 경각심:경각,심:0000
 경계선:경계,선:0000
 경계심:경계,심:0000
@@ -562,7 +549,6 @@
 금메달:금,메달:0000
 금반지:금,반지:0000
 금빛돌비늘:금빛,돌,비늘:0000
-금속성:금속성:0000
 금욕주의자:금욕,주의자:0000
 금융계:금융,계:0000
 금융업:금융,업:0000

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic Mon Oct 21 04:22:53 2013
@@ -5338,4 +5338,19 @@
 히오치,100110000X
 히포크라테스,100110000X
 히피스트럼,100110000X
-힌지,100110000X
\ No newline at end of file
+힌지,100110000X
+! moved from compounds.dic, previously decompounded to themselves
+경영인,100000000X
+경영자,100000000X
+경영주,100000000X
+경영진,100000000X
+갖저고리,100000000X
+개관적,100000000X
+개떡수제비,100000000X
+개망신,100100000X
+개발도상국,100000000X
+개연성,100000000X
+객지살이,100100000X
+거짓말,100100000X
+겹사돈,100000000X
+금속성,100000000X

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Mon Oct 21 04:22:53 2013
@@ -366,7 +366,7 @@ public final class KoreanFilter extends 
   private List<CompoundEntry> confirmCNoun(String input) {
     
     WordEntry cnoun = DictionaryUtil.getAllNoun(input);
-    if(cnoun!=null && cnoun.getFeature(WordEntry.IDX_NOUN)=='2') {
+    if(cnoun!=null && cnoun.isCompoundNoun()) {
       return cnoun.getCompounds();
     }
        

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 04:22:53 2013
@@ -58,7 +58,7 @@ public class DictionaryUtil {
             throw new IOException("Invalid file format: " + line);
           }
           
-          WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray());
+          WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray(), null);
           dictionary.add(entry.getWord(), entry);          
         }
       };
@@ -118,26 +118,22 @@ public class DictionaryUtil {
     return (WordEntry)dictionary.get(key);
   }
   
-  public static WordEntry getWordExceptVerb(String key) {    
-    WordEntry entry = getWord(key);    
-    if(entry==null) return null;
-    
-    if(entry.getFeature(WordEntry.IDX_NOUN)=='1'||
-        entry.getFeature(WordEntry.IDX_NOUN)=='2'||
-        entry.getFeature(WordEntry.IDX_BUSA)=='1'
-        ) 
+  public static WordEntry getWordExceptVerb(String key) {
+    WordEntry entry = getWord(key);
+    if (entry != null && (entry.isNoun() || entry.isAdverb())) {
       return entry;
-    
-    return null;
+    } else {
+      return null;
+    }
   }
   
   public static WordEntry getNoun(String key) {  
-
     WordEntry entry = getWord(key);
-    if(entry==null) return null;
-    
-    if(entry.getFeature(WordEntry.IDX_NOUN)=='1') return entry;
-    return null;
+    if (entry != null && entry.isNoun() && !entry.isCompoundNoun()) {
+      return entry;
+    } else {
+      return null;
+    }
   }
   
   /**
@@ -147,31 +143,30 @@ public class DictionaryUtil {
    * @return  WordEntry
    */
   public static WordEntry getAllNoun(String key) {  
-
     WordEntry entry = getWord(key);
-    if(entry==null) return null;
-
-    if(entry.getFeature(WordEntry.IDX_NOUN)=='1' || entry.getFeature(WordEntry.IDX_NOUN)=='2') return entry;
-    return null;
+    if (entry != null && entry.isNoun()) {
+      return entry;
+    } else {
+      return null;
+    }
   }
   
   public static WordEntry getVerb(String key) {
-    
     WordEntry entry = getWord(key);  
-    if(entry==null) return null;
-
-    if(entry.getFeature(WordEntry.IDX_VERB)=='1') {
+    if (entry != null && entry.isVerb()) {
       return entry;
+    } else {
+      return null;
     }
-    return null;
   }
   
   public static WordEntry getBusa(String key) {
     WordEntry entry = getWord(key);
-    if(entry==null) return null;
-
-    if(entry.getFeature(WordEntry.IDX_BUSA)=='1'&&entry.getFeature(WordEntry.IDX_NOUN)=='0') return entry;
-    return null;
+    if (entry != null && entry.isAdverb() && !entry.isNoun()) {
+      return entry;
+    } else {
+      return null;
+    }
   }
   
   public static WordEntry getUncompound(String key) {
@@ -225,9 +220,7 @@ public class DictionaryUtil {
   private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {
     List<CompoundEntry> list = new ArrayList<CompoundEntry>();
     for(String str: arr) {
-      CompoundEntry ce = new CompoundEntry(str);
-      ce.setOffset(source.indexOf(str));
-      list.add(ce);
+      list.add(new CompoundEntry(str, true));
     }
     return list;
   }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java Mon Oct 21 04:22:53 2013
@@ -21,67 +21,19 @@ package org.apache.lucene.analysis.ko.mo
  * 복합명사의 개별단어에 대한 정보를 담고있는 클래스 
  */
 public class CompoundEntry {
-  
-  private String word;
-  
-  private int offset = -1;
-  
-  private boolean exist = true;
-  
-  private char pos = PatternConstants.POS_NOUN;
-  
-  public CompoundEntry() {
-    
-  }
-  
-  public CompoundEntry(String w) {
-    this.word = w;
-  }
-  
-  public CompoundEntry(String w,int o) {
-    this(w);
-    this.offset = o;    
-  }
-  
-  public CompoundEntry(String w,int o, boolean is) {
-    this(w,o);
-    this.exist = is;    
-  }
-  
-  public CompoundEntry(String w,int o, boolean is, char p) {
-    this(w,o,is);
-    this.pos = p;
-  }
-  
-  public void setWord(String w) {
-    this.word = w;
-  }
-  
-  public void setOffset(int o) {
-    this.offset = o;
+  private final String word;
+  private final boolean exist;
+
+  public CompoundEntry(String word, boolean exist) {
+    this.word = word;
+    this.exist = exist;    
   }
   
   public String getWord() {
-    return this.word;
-  }
-  
-  public int getOffset() {
-    return this.offset;
+    return word;
   }
   
   public boolean isExist() {
     return exist;
   }
-  
-  public void setExist(boolean is) {
-    this.exist = is;
-  }
-  
-  public char getPos() {
-    return pos;
-  }
-
-  public void setPos(char pos) {
-    this.pos = pos;
-  }  
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -41,7 +41,7 @@ public class CompoundNounAnalyzer {
   public List<CompoundEntry> analyze(String input) {
     
     WordEntry entry = DictionaryUtil.getAllNoun(input);
-    if(entry!=null && entry.getCompounds().size()>0) 
+    if(entry!=null && entry.isCompoundNoun()) 
       return entry.getCompounds();
     
     return analyze(input,true);
@@ -221,12 +221,12 @@ public class CompoundNounAnalyzer {
     if(pos==input.length()) {     
       if(hasSuffix) {
         outputs.add(
-            new CompoundEntry(input.substring(0,len-1), 0, true,PatternConstants.POS_NOUN));
+            new CompoundEntry(input.substring(0,len-1), true));
         outputs.add(
-            new CompoundEntry(input.substring(len-1), 0, true,PatternConstants.POS_NOUN));
+            new CompoundEntry(input.substring(len-1), true));
       } else {
         outputs.add(
-            new CompoundEntry(input, 0, true,PatternConstants.POS_NOUN));
+            new CompoundEntry(input, true));
 
       } 
       
@@ -243,25 +243,25 @@ public class CompoundNounAnalyzer {
     WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
     if(prvEntry==null) {
       pSucess = analyze(prev, results, false);
-      if(!pSucess) results.add(new CompoundEntry(prev, 0, false,PatternConstants.POS_NOUN));
+      if(!pSucess) results.add(new CompoundEntry(prev, false));
     } else {
       pSucess = true;
-      if(prvEntry.getFeature(WordEntry.IDX_NOUN)=='2')
+      if(prvEntry.isCompoundNoun())
         results.addAll(prvEntry.getCompounds());
       else
-        results.add(new CompoundEntry(prev, 0, true,PatternConstants.POS_NOUN));
+        results.add(new CompoundEntry(prev, true));
     }
     
     WordEntry rearEntry = DictionaryUtil.getAllNoun(rear);
     if(rearEntry==null) {
       rSuccess = analyze(rear, results, false);
-      if(!rSuccess) results.add(new CompoundEntry(rear, 0, false,PatternConstants.POS_NOUN));
+      if(!rSuccess) results.add(new CompoundEntry(rear, false));
     } else {
       rSuccess = true;
-      if(rearEntry.getFeature(WordEntry.IDX_NOUN)=='2')
+      if(rearEntry.isCompoundNoun())
         results.addAll(rearEntry.getCompounds());
       else
-        results.add(new CompoundEntry(rear, 0, true,PatternConstants.POS_NOUN));
+        results.add(new CompoundEntry(rear, true));
     }
     
     if(!pSucess&&!rSuccess) {
@@ -365,20 +365,11 @@ public class CompoundNounAnalyzer {
    * @return compound entry
    */
   private CompoundEntry analyzeSingle(String input) {
-            
-    int score = AnalysisOutput.SCORE_ANALYSIS;
-    char pos = PatternConstants.POS_NOUN;
-    if(input.length()==1) return  new CompoundEntry(input, 0, true,pos);
+    if(input.length()==1) return  new CompoundEntry(input, true);
     
     WordEntry entry = DictionaryUtil.getWordExceptVerb(input);
-    if(entry!=null) {
-      score = AnalysisOutput.SCORE_CORRECT;
-      if(entry.getFeature(WordEntry.IDX_NOUN)!='1') {
-        pos = PatternConstants.POS_AID;
-      }
-    }
 
-    return new CompoundEntry(input, 0, score==AnalysisOutput.SCORE_CORRECT,pos);
+    return new CompoundEntry(input, entry != null);
   }
   
   private static boolean isAlphaNumeric(String text) {

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java Mon Oct 21 04:22:53 2013
@@ -26,33 +26,6 @@ import org.apache.lucene.analysis.ko.dic
 class IrregularUtil {
   private IrregularUtil() {}
   
-  // ㅂ 불규칙
-  public static final char IRR_TYPE_BIUP = 'B';
-  
-  // ㅎ 불규칙
-  public static final char IRR_TYPE_HIOOT = 'H';
-  
-  // ㄹ 불규칙
-  public static final char IRR_TYPE_LIUL = 'U';
-  
-  // 르 불규칙
-  public static final char IRR_TYPE_LOO = 'L';
-
-  // ㅅ 불규칙
-  public static final char IRR_TYPE_SIUT = 'S';
-  
-  // ㄷ 불규칙
-  public static final char IRR_TYPE_DI = 'D';
-  
-  // 러 불규칙
-  public static final char IRR_TYPE_RU = 'R';
-  
-  // 으 탈락
-  public static final char IRR_TYPE_UI = 'X';  
-  
-  // 규칙형
-  public static final char IRR_TYPE_REGULAR = 'X';
-  
   public static String[] restoreIrregularVerb(String start, String end) {
 
     if(end==null) end="";
@@ -152,7 +125,7 @@ class IrregularUtil {
         start = Character.toString(ch);    
 
       WordEntry entry = DictionaryUtil.getVerb(start);
-      if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_BIUP)
+      if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_BIUP)
         return new String[]{start,end};      
     }
 
@@ -186,7 +159,7 @@ class IrregularUtil {
       start = Character.toString(ch);
     
     WordEntry entry = DictionaryUtil.getVerb(start);
-    if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_DI)
+    if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_DI)
       return new String[]{start,end};
     
     return null;
@@ -211,7 +184,7 @@ class IrregularUtil {
       start = Character.toString(ch);
     
     WordEntry entry = DictionaryUtil.getVerb(start);
-    if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_SIUT)
+    if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_SIUT)
       return new String[]{start,end};
 
     return null;
@@ -243,7 +216,7 @@ class IrregularUtil {
         sb.append(Character.toString(ch1)).append("르");
 
       WordEntry entry = DictionaryUtil.getVerb(sb.toString());
-      if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_LOO)
+      if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_LOO)
         return new String[]{sb.toString(),end};    
     }
     
@@ -271,7 +244,7 @@ class IrregularUtil {
     start = start.substring(0,start.length()-1)+convEnd;
 
     WordEntry entry = DictionaryUtil.getVerb(start);
-    if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_LIUL)
+    if (entry!=null && entry.getVerbType() == WordEntry.VERB_TYPE_LIUL)
       return new String[]{start,end};  
     
     return null;
@@ -300,7 +273,7 @@ class IrregularUtil {
       start = Character.toString(ch2);
 
     WordEntry entry = DictionaryUtil.getVerb(start);
-    if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_RU)
+    if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_RU)
       return new String[]{start,end};
     
     return null;
@@ -332,7 +305,7 @@ class IrregularUtil {
       start = Character.toString(ch2);
 
     WordEntry entry = DictionaryUtil.getVerb(start);
-    if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_HIOOT)
+    if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_HIOOT)
       return new String[]{start,end};
     
     return null;

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -318,7 +318,7 @@ public class MorphAnalyzer {
         output.setPos(PatternConstants.POS_ETC);
         output.setPatn(PatternConstants.PTN_ADVJ);
       }
-      if(entry.getCompounds().size()>1) output.addCNoun(entry.getCompounds());
+      if(entry.isCompoundNoun()) output.addCNoun(entry.getCompounds());
     }else {
       if(MorphUtil.hasVerbOnly(stem)) return;
     }
@@ -351,7 +351,7 @@ public class MorphAnalyzer {
     o.setPomi(pomis[1]);
   
     WordEntry entry = DictionaryUtil.getVerb(o.getStem());  
-    if(entry!=null&&!("을".equals(end)&&entry.getFeature(WordEntry.IDX_REGURA)==IrregularUtil.IRR_TYPE_LIUL)) {              
+    if(entry!=null&&!("을".equals(end)&& entry.getVerbType() == WordEntry.VERB_TYPE_LIUL)) {              
       AnalysisOutput output = o.clone();
       output.setScore(AnalysisOutput.SCORE_CORRECT);
       MorphUtil.buildPtnVM(output, candidates);

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java Mon Oct 21 04:22:53 2013
@@ -231,10 +231,10 @@ class NounUtil {
           
     WordEntry cnoun = DictionaryUtil.getAllNoun(s);
     if(cnoun != null)  {
-      if(cnoun.getFeature(WordEntry.IDX_NOUN)=='2')
+      if(cnoun.isCompoundNoun())
         output.setCNoun(cnoun.getCompounds());
       else
-        output.setCNoun(new ArrayList<CompoundEntry>());
+        output.setCNoun(new ArrayList<CompoundEntry>()); // TODO: dont make all these lists
       output.setScore(AnalysisOutput.SCORE_CORRECT);
     }
           

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java Mon Oct 21 04:22:53 2013
@@ -38,58 +38,12 @@ public interface PatternConstants {
   public static int PTN_AID =  21;  //* 단일어 : 부사, 관형사, 감탄사 */
   public static int PTN_ADVJ =  22;  //* 부사 + 조사 : '빨리도' */
 
-  public static int PTN_NVM =  31;  //* 체언 + 동사 + 어미 */
-
-  public static int PTN_ZZZ =  35;  //* 문장부호, KS 완성형 기호열, 단독조사/어미 */  
-  
   //*          CLASSIFICATION OF PARTS OF SPEECH               */
-  //  3(basic) + 2(special) types of stem for 'pos'
-  public static char POS_NPXM  =   'N';       //* noun, pnoun, xn, nume */
-  public static char POS_VJXV  =   'V';       //* verb, adj, xverb      */
+  // types of stem for 'pos'
   public static char POS_AID   =   'Z';       //* adv, det, excl        */
 
-  public static char POS_PUNC  =   'q';       //* punctuation mark:./,/( */
-  public static char POS_SYMB  =   'Q';       //* special symbols       */
-
   //  normal types of stem for 'pos2'.
-  //  Only some of following symbols are used.
   public static char POS_NOUN  =   'N';       //* noun                  */
-  public static char POS_PNOUN  =  'P';       //* pronoun               */
-  public static char POS_XNOUN  =  'U';       //* dependent noun        */
-  public static char POS_NUMERAL = 'M';       //* numeral               */
-
-  public static char POS_PROPER  = 'O';       //* proper noun: NOT USED */
-
-  public static char POS_CNOUN  =  'C';       //* compound noun guessed */
-  public static char POS_NOUNK  =  'u';       //* guessed as noun       */
-
-  public static char POS_ASCall =  '@';       //* all alphanumeric chars*/
-  public static char POS_ASCend =  '$';       //* end with alphanumeric */
-  public static char POS_ASCmid =  '*';       //* ..+alphanumeric+Hangul*/
-
-  //* defined for numeral to digit conversion */
-  public static char POS_digits =  '1';       //* digit-string */
-  public static char POS_digitH  = '2';       //* digit-string + Hangul*/
-
   public static char POS_VERB  =   'V';       //* verb                  */
-  public static char POS_ADJ   =   'J';       //* adjective             */
-  public static char POS_XVERB =   'W';       //* auxiliary verb        */
-  public static char POS_XADJ  =   'K';       //* NOT USED YET          */
-
-  public static char POS_ADV   =   'B';       //* adverb                */
-  public static char POS_DET   =   'D';       //* determiner            */
-  public static char POS_EXCL  =   'L';       //* exclamation           */
-
-  public static char POS_JOSA   =  'j';       //* Korean Josa           */
-  public static char POS_COPULA =  'c';       //* copula '-Wi-'         */
-  public static char POS_EOMI   =  'e';       //* final Ending          */
-  public static char POS_PEOMI  =  'f';       //* prefinal Ending       */
-  public static char POS_NEOMI  =  'n';       //* nominalizing Eomi     */
-
-  public static char POS_PREFIX =  'p';       //* prefixes              */
-  public static char POS_SFX_N  =  's';       //* noun suffixes: '들/적'*/
-  public static char POS_SFX_V  =  't';       //* verb suffixes: '하/되'*/
-
   public static char POS_ETC   =   'Z';       //* not decided yet       */
-
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java Mon Oct 21 04:22:53 2013
@@ -101,7 +101,7 @@ class VerbUtil {
     
     WordEntry entry = null;
     if(success&&(entry=DictionaryUtil.getAllNoun(o.getStem()))!=null) { 
-      if(entry.getFeature(WordEntry.IDX_NOUN)=='2') {
+      if(entry.isCompoundNoun()) {
         o.setCNoun(entry.getCompounds());
       }
       o.setScore(AnalysisOutput.SCORE_CORRECT);
@@ -191,7 +191,7 @@ class VerbUtil {
     if(o.getVsfx().equals("되")&&entry.getFeature(WordEntry.IDX_BEV)!='1') return false;        
     
     o.setScore(AnalysisOutput.SCORE_CORRECT);
-    if(entry.getFeature(WordEntry.IDX_NOUN)=='2') {
+    if(entry.isCompoundNoun()) {
       o.setCNoun(entry.getCompounds());
     }
     

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java Mon Oct 21 04:22:53 2013
@@ -22,16 +22,37 @@ import java.util.List;
 
 public class WordEntry {
 
-  public static final int IDX_NOUN = 0;
-  public static final int IDX_VERB = 1;
-  public static final int IDX_BUSA = 2;
+  static final int IDX_NOUN = 0;
+  static final int IDX_VERB = 1;
+  static final int IDX_BUSA = 2;
   public static final int IDX_DOV = 3;
   public static final int IDX_BEV = 4;
   public static final int IDX_NE = 5;
-  public static final int IDX_ADJ = 6; // 형용사
-  public static final int IDX_NPR = 7;  // 명사의 분류 (M:Measure)
-  public static final int IDX_CNOUNX = 8; 
-  public static final int IDX_REGURA = 9;
+  static final int IDX_REGURA = 9;
+  
+  /** Irregular verb type (ㅂ-final) */
+  public static final int VERB_TYPE_BIUP = 'B';
+  
+  /** Irregular verb type (ㅎ-final) */
+  public static final int VERB_TYPE_HIOOT = 'H';
+  
+  /** Irregular verb type (ㄹ-final) */
+  public static final int VERB_TYPE_LIUL = 'U';
+  
+  /** Irregular verb type (르-final) */
+  public static final int VERB_TYPE_LOO = 'L';
+
+  /** Irregular verb type (ㅅ-final) */
+  public static final int VERB_TYPE_SIUT = 'S';
+  
+  /** Irregular verb type (ㄷ-final) */
+  public static final int VERB_TYPE_DI = 'D';
+  
+  /** Irregular verb type (러-final) */
+  public static final int VERB_TYPE_RU = 'R';
+  
+  /** Regular verb type */
+  public static final int VERB_TYPE_REGULAR = 'X';
   
   /**
    * 단어
@@ -45,21 +66,17 @@ public class WordEntry {
   
   private final List<CompoundEntry> compounds;
   
-  public WordEntry(String word) {
-    this(word, null);
-  }
-  
-  public WordEntry(String word, char[] cs) {
-    this(word, cs, Collections.<CompoundEntry>emptyList());
-  }
-  
   public WordEntry(String word, char[] cs, List<CompoundEntry> compounds) {
     if (cs.length != 10) {
       throw new IllegalArgumentException("invalid features for word: " + word + ", got:" + new String(cs));
     } 
     this.word = word;
     this.features = cs;
-    this.compounds = Collections.unmodifiableList(compounds);
+    this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+    // has compound list iff compound feature is set ('2' in main dictionary, '9' in uncompounds)
+    // TODO: implement validCompound check differently: uncompounds shouldn't use WordEntry
+    assert (features[IDX_NOUN] >= '2' && compounds != null && compounds.size() > 1) 
+        || (features[IDX_NOUN] <= '2' && compounds == null) : "inconsistent compound data for word: " + word;
   }
   
   public String getWord() {
@@ -67,15 +84,37 @@ public class WordEntry {
   }
   
   public char getFeature(int index) {
-    if(features==null||features.length<index) return '0';    
     return features[index];
   }
   
-  public char[] getFeatures() {
-    return this.features;
+  /** Returns true if the entry is a noun (or compound noun) */
+  public boolean isNoun() {
+    return features[IDX_NOUN] != '0';
   }
   
+  /** Returns true if entry is a compound noun */
+  public boolean isCompoundNoun() {
+    return features[IDX_NOUN] >= '2';
+  }
+  
+  /** Returns List of compounds for word */
   public List<CompoundEntry> getCompounds() {
-    return this.compounds;
+    assert isCompoundNoun();
+    return compounds;
+  }
+  
+  /** Returns true if entry is verb */
+  public boolean isVerb() {
+    return features[IDX_VERB] == '1';
+  }
+  
+  /** Returns verb type (VERB_TYPE_REGULAR or one of the irregular VERB_TYPE_* constants) */
+  public int getVerbType() {
+    return features[IDX_REGURA];
+  }
+  
+  /** Returns true if entry is busa (adverb) */
+  public boolean isAdverb() {
+    return features[IDX_BUSA] == '1';
   }
 }

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -345,12 +345,12 @@ public class WordSpaceAnalyzer {
       List<CompoundEntry> cnouns = o.getCNounList();
       if(cnouns.size()==0) {
         boolean is = DictionaryUtil.getWordExceptVerb(pvword)!=null;
-        cnouns.add(new CompoundEntry(pvword,0,is));
+        cnouns.add(new CompoundEntry(pvword, is));
       } 
       
       for(AnalysisOutput candidate : candidates) {
         candidate.getCNounList().addAll(cnouns);
-        candidate.getCNounList().add(new CompoundEntry(candidate.getStem(),0,true));
+        candidate.getCNounList().add(new CompoundEntry(candidate.getStem(), true));
         candidate.setStem(pvword+candidate.getStem()); // 이렇게 해야 WSOutput 에 복합명사 처리할 때 정상처리됨
       }
       
@@ -444,10 +444,7 @@ public class WordSpaceAnalyzer {
     AnalysisOutput o = candidates.remove(0);    
     AnalysisOutput po = output.getPhrases().size()>0 ?  output.getPhrases().get(output.getPhrases().size()-1) : null;
     
-    String ejend = o.getSource().substring(o.getStem().length());
-    
-    char[] chrs = po!=null&&po.getStem().length()>0 ? MorphUtil.decompose(po.getStem().charAt(po.getStem().length()-1)) : null;
-    String pjend = po!=null&&po.getStem().length()>0 ? po.getSource().substring(po.getStem().length()) : null;
+    String ejend = o.getSource().substring(o.getStem().length());    
     
     char ja = 'x'; // 임의의 문자
     if(po!=null&&(po.getPatn()==PatternConstants.PTN_VM||po.getPatn()==PatternConstants.PTN_VMCM||po.getPatn()==PatternConstants.PTN_VMXM)) {   

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic Mon Oct 21 04:22:53 2013
@@ -70,10 +70,6 @@
 결의문:결의,문:0000
 결의안:결의,안:0000
 결혼식:결혼,식:1000
-경영인:경영인:0000
-경영자:경영자:0000
-경영주:경영주:0000
-경영진:경영진:0000
 고교연맹전:고교,연맹전:0000
 골프장:골프,장:0000
 곰비늘고사리:곰비늘,고사리:0000
@@ -99,26 +95,21 @@
 고투자율:고,투자,율:0000
 경전철:경,전철:0000
 갓김치:갓,김치:0000
-갖저고리:갖저고리:0000
 밤하늘:밤,하늘:0000
 개개인:개,개인:0000
 개고기:개,고기:0000
 개고사리:개,고사리:0000
-개관적:개관적:0000
 개구리때:개구리,때:0000
 개구리밥:개구리,밥:0000
 개그맨:개그,맨:0000
 개기월식:개기,월식:0000
 개기일식:개기,일식:0000
 개념화:개념,화:1100
-개떡수제비:개떡수제비:0000
 개똥벌레:개똥,벌레:0000
 개똥지빠귀:개똥,지빠귀:0000
 개막식:개막,식:0000
-개망신:개망신:1000
 개머루덩굴:개머루,덩굴:0000
 개미굴:개미,굴:0000
-개발도상국:개발도상국:0000
 개발자:개발,자:0000
 개밥도둑:개,밥,도둑:0000
 개별화:개별,화:0000
@@ -128,7 +119,6 @@
 개선문:개선,문:0000
 개선안:개선,안:0000
 개성적:개성,적:0000
-개연성:개연성:0000
 개울가:개울,가:0000
 개울물:개울,물:0000
 개정령:개정,령:0000
@@ -143,7 +133,6 @@
 객관성:객관,성:0000
 객관적:객관,적:0000
 객관화:객관,화:1100
-객지살이:객지살이:1000
 갯마을:갯,마을:0000
 갯바위:갯,바위:0000
 갯버들:갯,버들:0000
@@ -158,7 +147,6 @@
 거족적:거족,적:0000
 거주지:거주,지:0000
 거지반:거지,반:0000
-거짓말:거짓말:1000
 거짓말쟁이:거짓,말,쟁이:0000
 거짓부렁거짓부렁:거짓,부렁:0000
 걱정거리:걱정,거리:0000
@@ -188,7 +176,6 @@
 결벽증:결벽,증:0000
 결사적:결사,적:0000
 겹간통:겹,간통:1000
-겹사돈:겹사돈:0000
 경각심:경각,심:0000
 경계선:경계,선:0000
 경계심:경계,심:0000
@@ -562,7 +549,6 @@
 금메달:금,메달:0000
 금반지:금,반지:0000
 금빛돌비늘:금빛,돌,비늘:0000
-금속성:금속성:0000
 금욕주의자:금욕,주의자:0000
 금융계:금융,계:0000
 금융업:금융,업:0000

Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic Mon Oct 21 04:22:53 2013
@@ -5338,4 +5338,19 @@
 히오치,100110000X
 히포크라테스,100110000X
 히피스트럼,100110000X
-힌지,100110000X
\ No newline at end of file
+힌지,100110000X
+! compound exceptions
+경영인,100000000X
+경영자,100000000X
+경영주,100000000X
+경영진,100000000X
+갖저고리,100000000X
+개관적,100000000X
+개떡수제비,100000000X
+개망신,100100000X
+개발도상국,100000000X
+개연성,100000000X
+객지살이,100100000X
+거짓말,100100000X
+겹사돈,100000000X
+금속성,100000000X