You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 06:22:54 UTC
svn commit: r1534021 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src: data/
java/org/apache/lucene/analysis/ko/ java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/morph/
resources/org/apache/lucene/analysis/ko/dic/
Author: rmuir
Date: Mon Oct 21 04:22:53 2013
New Revision: 1534021
URL: http://svn.apache.org/r1534021
Log:
LUCENE-4956: clean up compound / feature processing a bit (more coming)
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/compounds.dic Mon Oct 21 04:22:53 2013
@@ -70,10 +70,6 @@
ê²°ì문:ê²°ì,문:0000
ê²°ìì:ê²°ì,ì:0000
ê²°í¼ì:ê²°í¼,ì:1000
-ê²½ìì¸:ê²½ìì¸:0000
-ê²½ìì:ê²½ìì:0000
-ê²½ì주:ê²½ì주:0000
-ê²½ìì§:ê²½ìì§:0000
ê³ êµì°ë§¹ì :ê³ êµ,ì°ë§¹ì :0000
골íì¥:골í,ì¥:0000
ê³°ë¹ëê³ ì¬ë¦¬:ê³°ë¹ë,ê³ ì¬ë¦¬:0000
@@ -99,26 +95,21 @@
ê³ í¬ìì¨:ê³ ,í¬ì,ì¨:0000
ê²½ì ì² :ê²½,ì ì² :0000
ê°ê¹ì¹:ê°,ê¹ì¹:0000
-ê°ì ê³ ë¦¬:ê°ì ê³ ë¦¬:0000
ë°¤íë:ë°¤,íë:0000
ê°ê°ì¸:ê°,ê°ì¸:0000
ê°ê³ 기:ê°,ê³ ê¸°:0000
ê°ê³ ì¬ë¦¬:ê°,ê³ ì¬ë¦¬:0000
-ê°ê´ì :ê°ê´ì :0000
ê°êµ¬ë¦¬ë:ê°êµ¬ë¦¬,ë:0000
ê°êµ¬ë¦¬ë°¥:ê°êµ¬ë¦¬,ë°¥:0000
ê°ê·¸ë§¨:ê°ê·¸,맨:0000
ê°ê¸°ìì:ê°ê¸°,ìì:0000
ê°ê¸°ì¼ì:ê°ê¸°,ì¼ì:0000
ê°ë
í:ê°ë
,í:1100
-ê°ë¡ìì ë¹:ê°ë¡ìì ë¹:0000
ê°ë¥ë²ë :ê°ë¥,ë²ë :0000
ê°ë¥ì§ë¹ ê·:ê°ë¥,ì§ë¹ ê·:0000
ê°ë§ì:ê°ë§,ì:0000
-ê°ë§ì :ê°ë§ì :1000
ê°ë¨¸ë£¨ë©êµ´:ê°ë¨¸ë£¨,ë©êµ´:0000
ê°ë¯¸êµ´:ê°ë¯¸,êµ´:0000
-ê°ë°ëìêµ:ê°ë°ëìêµ:0000
ê°ë°ì:ê°ë°,ì:0000
ê°ë°¥ëë:ê°,ë°¥,ëë:0000
ê°ë³í:ê°ë³,í:0000
@@ -128,7 +119,6 @@
ê°ì 문:ê°ì ,문:0000
ê°ì ì:ê°ì ,ì:0000
ê°ì±ì :ê°ì±,ì :0000
-ê°ì°ì±:ê°ì°ì±:0000
ê°ì¸ê°:ê°ì¸,ê°:0000
ê°ì¸ë¬¼:ê°ì¸,물:0000
ê°ì ë ¹:ê°ì ,ë ¹:0000
@@ -143,7 +133,6 @@
ê°ê´ì±:ê°ê´,ì±:0000
ê°ê´ì :ê°ê´,ì :0000
ê°ê´í:ê°ê´,í:1100
-ê°ì§ì´ì´:ê°ì§ì´ì´:1000
ê°¯ë§ì:ê°¯,ë§ì:0000
ê°¯ë°ì:ê°¯,ë°ì:0000
ê°¯ë²ë¤:ê°¯,ë²ë¤:0000
@@ -158,7 +147,6 @@
거족ì :거족,ì :0000
거주ì§:거주,ì§:0000
ê±°ì§ë°:ê±°ì§,ë°:0000
-ê±°ì§ë§:ê±°ì§ë§:1000
ê±°ì§ë§ìì´:ê±°ì§,ë§,ìì´:0000
ê±°ì§ë¶ë ê±°ì§ë¶ë :ê±°ì§,ë¶ë :0000
ê±±ì 거리:ê±±ì ,거리:0000
@@ -188,7 +176,6 @@
ê²°ë²½ì¦:ê²°ë²½,ì¦:0000
ê²°ì¬ì :ê²°ì¬,ì :0000
ê²¹ê°íµ:ê²¹,ê°íµ:1000
-ê²¹ì¬ë:ê²¹ì¬ë:0000
ê²½ê°ì¬:ê²½ê°,ì¬:0000
ê²½ê³ì :ê²½ê³,ì :0000
ê²½ê³ì¬:ê²½ê³,ì¬:0000
@@ -562,7 +549,6 @@
ê¸ë©ë¬:ê¸,ë©ë¬:0000
ê¸ë°ì§:ê¸,ë°ì§:0000
ê¸ë¹ëë¹ë:ê¸ë¹,ë,ë¹ë:0000
-ê¸ìì±:ê¸ìì±:0000
ê¸ì주ìì:ê¸ì,주ìì:0000
ê¸ìµê³:ê¸ìµ,ê³:0000
ê¸ìµì
:ê¸ìµ,ì
:0000
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/data/extension.dic Mon Oct 21 04:22:53 2013
@@ -5338,4 +5338,19 @@
íì¤ì¹,100110000X
íí¬í¬ë¼í
ì¤,100110000X
íí¼ì¤í¸ë¼,100110000X
-íì§,100110000X
\ No newline at end of file
+íì§,100110000X
+! moved from compounds.dic, previously decompounded to themselves
+ê²½ìì¸,100000000X
+ê²½ìì,100000000X
+ê²½ì주,100000000X
+ê²½ìì§,100000000X
+ê°ì ê³ ë¦¬,100000000X
+ê°ê´ì ,100000000X
+ê°ë¡ìì ë¹,100000000X
+ê°ë§ì ,100100000X
+ê°ë°ëìêµ,100000000X
+ê°ì°ì±,100000000X
+ê°ì§ì´ì´,100100000X
+ê±°ì§ë§,100100000X
+ê²¹ì¬ë,100000000X
+ê¸ìì±,100000000X
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/KoreanFilter.java Mon Oct 21 04:22:53 2013
@@ -366,7 +366,7 @@ public final class KoreanFilter extends
private List<CompoundEntry> confirmCNoun(String input) {
WordEntry cnoun = DictionaryUtil.getAllNoun(input);
- if(cnoun!=null && cnoun.getFeature(WordEntry.IDX_NOUN)=='2') {
+ if(cnoun!=null && cnoun.isCompoundNoun()) {
return cnoun.getCompounds();
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryUtil.java Mon Oct 21 04:22:53 2013
@@ -58,7 +58,7 @@ public class DictionaryUtil {
throw new IOException("Invalid file format: " + line);
}
- WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray());
+ WordEntry entry = new WordEntry(infos[0].trim(),infos[1].toCharArray(), null);
dictionary.add(entry.getWord(), entry);
}
};
@@ -118,26 +118,22 @@ public class DictionaryUtil {
return (WordEntry)dictionary.get(key);
}
- public static WordEntry getWordExceptVerb(String key) {
- WordEntry entry = getWord(key);
- if(entry==null) return null;
-
- if(entry.getFeature(WordEntry.IDX_NOUN)=='1'||
- entry.getFeature(WordEntry.IDX_NOUN)=='2'||
- entry.getFeature(WordEntry.IDX_BUSA)=='1'
- )
+ public static WordEntry getWordExceptVerb(String key) {
+ WordEntry entry = getWord(key);
+ if (entry != null && (entry.isNoun() || entry.isAdverb())) {
return entry;
-
- return null;
+ } else {
+ return null;
+ }
}
public static WordEntry getNoun(String key) {
-
WordEntry entry = getWord(key);
- if(entry==null) return null;
-
- if(entry.getFeature(WordEntry.IDX_NOUN)=='1') return entry;
- return null;
+ if (entry != null && entry.isNoun() && !entry.isCompoundNoun()) {
+ return entry;
+ } else {
+ return null;
+ }
}
/**
@@ -147,31 +143,30 @@ public class DictionaryUtil {
* @return WordEntry
*/
public static WordEntry getAllNoun(String key) {
-
WordEntry entry = getWord(key);
- if(entry==null) return null;
-
- if(entry.getFeature(WordEntry.IDX_NOUN)=='1' || entry.getFeature(WordEntry.IDX_NOUN)=='2') return entry;
- return null;
+ if (entry != null && entry.isNoun()) {
+ return entry;
+ } else {
+ return null;
+ }
}
public static WordEntry getVerb(String key) {
-
WordEntry entry = getWord(key);
- if(entry==null) return null;
-
- if(entry.getFeature(WordEntry.IDX_VERB)=='1') {
+ if (entry != null && entry.isVerb()) {
return entry;
+ } else {
+ return null;
}
- return null;
}
public static WordEntry getBusa(String key) {
WordEntry entry = getWord(key);
- if(entry==null) return null;
-
- if(entry.getFeature(WordEntry.IDX_BUSA)=='1'&&entry.getFeature(WordEntry.IDX_NOUN)=='0') return entry;
- return null;
+ if (entry != null && entry.isAdverb() && !entry.isNoun()) {
+ return entry;
+ } else {
+ return null;
+ }
}
public static WordEntry getUncompound(String key) {
@@ -225,9 +220,7 @@ public class DictionaryUtil {
private static List<CompoundEntry> compoundArrayToList(String source, String[] arr) {
List<CompoundEntry> list = new ArrayList<CompoundEntry>();
for(String str: arr) {
- CompoundEntry ce = new CompoundEntry(str);
- ce.setOffset(source.indexOf(str));
- list.add(ce);
+ list.add(new CompoundEntry(str, true));
}
return list;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundEntry.java Mon Oct 21 04:22:53 2013
@@ -21,67 +21,19 @@ package org.apache.lucene.analysis.ko.mo
* 복합명사의 개별단어에 대한 정보를 담고 있는 클래스
*/
public class CompoundEntry {
-
- private String word;
-
- private int offset = -1;
-
- private boolean exist = true;
-
- private char pos = PatternConstants.POS_NOUN;
-
- public CompoundEntry() {
-
- }
-
- public CompoundEntry(String w) {
- this.word = w;
- }
-
- public CompoundEntry(String w,int o) {
- this(w);
- this.offset = o;
- }
-
- public CompoundEntry(String w,int o, boolean is) {
- this(w,o);
- this.exist = is;
- }
-
- public CompoundEntry(String w,int o, boolean is, char p) {
- this(w,o,is);
- this.pos = p;
- }
-
- public void setWord(String w) {
- this.word = w;
- }
-
- public void setOffset(int o) {
- this.offset = o;
+ private final String word;
+ private final boolean exist;
+
+ public CompoundEntry(String word, boolean exist) {
+ this.word = word;
+ this.exist = exist;
}
public String getWord() {
- return this.word;
- }
-
- public int getOffset() {
- return this.offset;
+ return word;
}
public boolean isExist() {
return exist;
}
-
- public void setExist(boolean is) {
- this.exist = is;
- }
-
- public char getPos() {
- return pos;
- }
-
- public void setPos(char pos) {
- this.pos = pos;
- }
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/CompoundNounAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -41,7 +41,7 @@ public class CompoundNounAnalyzer {
public List<CompoundEntry> analyze(String input) {
WordEntry entry = DictionaryUtil.getAllNoun(input);
- if(entry!=null && entry.getCompounds().size()>0)
+ if(entry!=null && entry.isCompoundNoun())
return entry.getCompounds();
return analyze(input,true);
@@ -221,12 +221,12 @@ public class CompoundNounAnalyzer {
if(pos==input.length()) {
if(hasSuffix) {
outputs.add(
- new CompoundEntry(input.substring(0,len-1), 0, true,PatternConstants.POS_NOUN));
+ new CompoundEntry(input.substring(0,len-1), true));
outputs.add(
- new CompoundEntry(input.substring(len-1), 0, true,PatternConstants.POS_NOUN));
+ new CompoundEntry(input.substring(len-1), true));
} else {
outputs.add(
- new CompoundEntry(input, 0, true,PatternConstants.POS_NOUN));
+ new CompoundEntry(input, true));
}
@@ -243,25 +243,25 @@ public class CompoundNounAnalyzer {
WordEntry prvEntry = DictionaryUtil.getAllNoun(prev);
if(prvEntry==null) {
pSucess = analyze(prev, results, false);
- if(!pSucess) results.add(new CompoundEntry(prev, 0, false,PatternConstants.POS_NOUN));
+ if(!pSucess) results.add(new CompoundEntry(prev, false));
} else {
pSucess = true;
- if(prvEntry.getFeature(WordEntry.IDX_NOUN)=='2')
+ if(prvEntry.isCompoundNoun())
results.addAll(prvEntry.getCompounds());
else
- results.add(new CompoundEntry(prev, 0, true,PatternConstants.POS_NOUN));
+ results.add(new CompoundEntry(prev, true));
}
WordEntry rearEntry = DictionaryUtil.getAllNoun(rear);
if(rearEntry==null) {
rSuccess = analyze(rear, results, false);
- if(!rSuccess) results.add(new CompoundEntry(rear, 0, false,PatternConstants.POS_NOUN));
+ if(!rSuccess) results.add(new CompoundEntry(rear, false));
} else {
rSuccess = true;
- if(rearEntry.getFeature(WordEntry.IDX_NOUN)=='2')
+ if(rearEntry.isCompoundNoun())
results.addAll(rearEntry.getCompounds());
else
- results.add(new CompoundEntry(rear, 0, true,PatternConstants.POS_NOUN));
+ results.add(new CompoundEntry(rear, true));
}
if(!pSucess&&!rSuccess) {
@@ -365,20 +365,11 @@ public class CompoundNounAnalyzer {
* @return compound entry
*/
private CompoundEntry analyzeSingle(String input) {
-
- int score = AnalysisOutput.SCORE_ANALYSIS;
- char pos = PatternConstants.POS_NOUN;
- if(input.length()==1) return new CompoundEntry(input, 0, true,pos);
+ if(input.length()==1) return new CompoundEntry(input, true);
WordEntry entry = DictionaryUtil.getWordExceptVerb(input);
- if(entry!=null) {
- score = AnalysisOutput.SCORE_CORRECT;
- if(entry.getFeature(WordEntry.IDX_NOUN)!='1') {
- pos = PatternConstants.POS_AID;
- }
- }
- return new CompoundEntry(input, 0, score==AnalysisOutput.SCORE_CORRECT,pos);
+ return new CompoundEntry(input, entry != null);
}
private static boolean isAlphaNumeric(String text) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/IrregularUtil.java Mon Oct 21 04:22:53 2013
@@ -26,33 +26,6 @@ import org.apache.lucene.analysis.ko.dic
class IrregularUtil {
private IrregularUtil() {}
- // ㅂ 불규칙
- public static final char IRR_TYPE_BIUP = 'B';
-
- // ㅎ 불규칙
- public static final char IRR_TYPE_HIOOT = 'H';
-
- // ㄹ 불규칙
- public static final char IRR_TYPE_LIUL = 'U';
-
- // 르 불규칙
- public static final char IRR_TYPE_LOO = 'L';
-
- // ㅅ 불규칙
- public static final char IRR_TYPE_SIUT = 'S';
-
- // ㄷ 불규칙
- public static final char IRR_TYPE_DI = 'D';
-
- // 러 불규칙
- public static final char IRR_TYPE_RU = 'R';
-
- // 으 탈락
- public static final char IRR_TYPE_UI = 'X';
-
- // 규칙형
- public static final char IRR_TYPE_REGULAR = 'X';
-
public static String[] restoreIrregularVerb(String start, String end) {
if(end==null) end="";
@@ -152,7 +125,7 @@ class IrregularUtil {
start = Character.toString(ch);
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_BIUP)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_BIUP)
return new String[]{start,end};
}
@@ -186,7 +159,7 @@ class IrregularUtil {
start = Character.toString(ch);
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_DI)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_DI)
return new String[]{start,end};
return null;
@@ -211,7 +184,7 @@ class IrregularUtil {
start = Character.toString(ch);
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_SIUT)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_SIUT)
return new String[]{start,end};
return null;
@@ -243,7 +216,7 @@ class IrregularUtil {
sb.append(Character.toString(ch1)).append("르");
WordEntry entry = DictionaryUtil.getVerb(sb.toString());
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_LOO)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_LOO)
return new String[]{sb.toString(),end};
}
@@ -271,7 +244,7 @@ class IrregularUtil {
start = start.substring(0,start.length()-1)+convEnd;
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_LIUL)
+ if (entry!=null && entry.getVerbType() == WordEntry.VERB_TYPE_LIUL)
return new String[]{start,end};
return null;
@@ -300,7 +273,7 @@ class IrregularUtil {
start = Character.toString(ch2);
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_RU)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_RU)
return new String[]{start,end};
return null;
@@ -332,7 +305,7 @@ class IrregularUtil {
start = Character.toString(ch2);
WordEntry entry = DictionaryUtil.getVerb(start);
- if(entry!=null&&entry.getFeature(WordEntry.IDX_REGURA)==IRR_TYPE_HIOOT)
+ if (entry != null && entry.getVerbType() == WordEntry.VERB_TYPE_HIOOT)
return new String[]{start,end};
return null;
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/MorphAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -318,7 +318,7 @@ public class MorphAnalyzer {
output.setPos(PatternConstants.POS_ETC);
output.setPatn(PatternConstants.PTN_ADVJ);
}
- if(entry.getCompounds().size()>1) output.addCNoun(entry.getCompounds());
+ if(entry.isCompoundNoun()) output.addCNoun(entry.getCompounds());
}else {
if(MorphUtil.hasVerbOnly(stem)) return;
}
@@ -351,7 +351,7 @@ public class MorphAnalyzer {
o.setPomi(pomis[1]);
WordEntry entry = DictionaryUtil.getVerb(o.getStem());
- if(entry!=null&&!("ì".equals(end)&&entry.getFeature(WordEntry.IDX_REGURA)==IrregularUtil.IRR_TYPE_LIUL)) {
+ if(entry!=null&&!("ì".equals(end)&& entry.getVerbType() == WordEntry.VERB_TYPE_LIUL)) {
AnalysisOutput output = o.clone();
output.setScore(AnalysisOutput.SCORE_CORRECT);
MorphUtil.buildPtnVM(output, candidates);
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/NounUtil.java Mon Oct 21 04:22:53 2013
@@ -231,10 +231,10 @@ class NounUtil {
WordEntry cnoun = DictionaryUtil.getAllNoun(s);
if(cnoun != null) {
- if(cnoun.getFeature(WordEntry.IDX_NOUN)=='2')
+ if(cnoun.isCompoundNoun())
output.setCNoun(cnoun.getCompounds());
else
- output.setCNoun(new ArrayList<CompoundEntry>());
+ output.setCNoun(new ArrayList<CompoundEntry>()); // TODO: don't make all these lists
output.setScore(AnalysisOutput.SCORE_CORRECT);
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/PatternConstants.java Mon Oct 21 04:22:53 2013
@@ -38,58 +38,12 @@ public interface PatternConstants {
public static int PTN_AID = 21; //* ë¨ì¼ì´ : ë¶ì¬, ê´íì¬, ê°íì¬ */
public static int PTN_ADVJ = 22; //* ë¶ì¬ + ì¡°ì¬ : '빨리ë' */
- public static int PTN_NVM = 31; //* ì²´ì¸ + ëì¬ + ì´ë¯¸ */
-
- public static int PTN_ZZZ = 35; //* 문ì¥ë¶í¸, KS ìì±í 기í¸ì´, ë¨ë
ì¡°ì¬/ì´ë¯¸ */
-
//* CLASSIFICATION OF PARTS OF SPEECH */
- // 3(basic) + 2(special) types of stem for 'pos'
- public static char POS_NPXM = 'N'; //* noun, pnoun, xn, nume */
- public static char POS_VJXV = 'V'; //* verb, adj, xverb */
+ // types of stem for 'pos'
public static char POS_AID = 'Z'; //* adv, det, excl */
- public static char POS_PUNC = 'q'; //* punctuation mark:./,/( */
- public static char POS_SYMB = 'Q'; //* special symbols */
-
// normal types of stem for 'pos2'.
- // Only some of following symbols are used.
public static char POS_NOUN = 'N'; //* noun */
- public static char POS_PNOUN = 'P'; //* pronoun */
- public static char POS_XNOUN = 'U'; //* dependent noun */
- public static char POS_NUMERAL = 'M'; //* numeral */
-
- public static char POS_PROPER = 'O'; //* proper noun: NOT USED */
-
- public static char POS_CNOUN = 'C'; //* compound noun guessed */
- public static char POS_NOUNK = 'u'; //* guessed as noun */
-
- public static char POS_ASCall = '@'; //* all alphanumeric chars*/
- public static char POS_ASCend = '$'; //* end with alphanumeric */
- public static char POS_ASCmid = '*'; //* ..+alphanumeric+Hangul*/
-
- //* defined for numeral to digit conversion */
- public static char POS_digits = '1'; //* digit-string */
- public static char POS_digitH = '2'; //* digit-string + Hangul*/
-
public static char POS_VERB = 'V'; //* verb */
- public static char POS_ADJ = 'J'; //* adjective */
- public static char POS_XVERB = 'W'; //* auxiliary verb */
- public static char POS_XADJ = 'K'; //* NOT USED YET */
-
- public static char POS_ADV = 'B'; //* adverb */
- public static char POS_DET = 'D'; //* determiner */
- public static char POS_EXCL = 'L'; //* exclamation */
-
- public static char POS_JOSA = 'j'; //* Korean Josa */
- public static char POS_COPULA = 'c'; //* copula '-Wi-' */
- public static char POS_EOMI = 'e'; //* final Ending */
- public static char POS_PEOMI = 'f'; //* prefinal Ending */
- public static char POS_NEOMI = 'n'; //* nominalizing Eomi */
-
- public static char POS_PREFIX = 'p'; //* prefixes */
- public static char POS_SFX_N = 's'; //* noun suffixes: 'ë¤/ì '*/
- public static char POS_SFX_V = 't'; //* verb suffixes: 'í/ë'*/
-
public static char POS_ETC = 'Z'; //* not decided yet */
-
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/VerbUtil.java Mon Oct 21 04:22:53 2013
@@ -101,7 +101,7 @@ class VerbUtil {
WordEntry entry = null;
if(success&&(entry=DictionaryUtil.getAllNoun(o.getStem()))!=null) {
- if(entry.getFeature(WordEntry.IDX_NOUN)=='2') {
+ if(entry.isCompoundNoun()) {
o.setCNoun(entry.getCompounds());
}
o.setScore(AnalysisOutput.SCORE_CORRECT);
@@ -191,7 +191,7 @@ class VerbUtil {
if(o.getVsfx().equals("ë")&&entry.getFeature(WordEntry.IDX_BEV)!='1') return false;
o.setScore(AnalysisOutput.SCORE_CORRECT);
- if(entry.getFeature(WordEntry.IDX_NOUN)=='2') {
+ if(entry.isCompoundNoun()) {
o.setCNoun(entry.getCompounds());
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordEntry.java Mon Oct 21 04:22:53 2013
@@ -22,16 +22,37 @@ import java.util.List;
public class WordEntry {
- public static final int IDX_NOUN = 0;
- public static final int IDX_VERB = 1;
- public static final int IDX_BUSA = 2;
+ static final int IDX_NOUN = 0;
+ static final int IDX_VERB = 1;
+ static final int IDX_BUSA = 2;
public static final int IDX_DOV = 3;
public static final int IDX_BEV = 4;
public static final int IDX_NE = 5;
- public static final int IDX_ADJ = 6; // íì©ì¬
- public static final int IDX_NPR = 7; // ëª
ì¬ì ë¶ë¥ (M:Measure)
- public static final int IDX_CNOUNX = 8;
- public static final int IDX_REGURA = 9;
+ static final int IDX_REGURA = 9;
+
+ /** Irregular verb type (ㅂ-final) */
+ public static final int VERB_TYPE_BIUP = 'B';
+
+ /** Irregular verb type (ㅎ-final) */
+ public static final int VERB_TYPE_HIOOT = 'H';
+
+ /** Irregular verb type (ㄹ-final) */
+ public static final int VERB_TYPE_LIUL = 'U';
+
+ /** Irregular verb type (르-final) */
+ public static final int VERB_TYPE_LOO = 'L';
+
+ /** Irregular verb type (ㅅ-final) */
+ public static final int VERB_TYPE_SIUT = 'S';
+
+ /** Irregular verb type (ㄷ-final) */
+ public static final int VERB_TYPE_DI = 'D';
+
+ /** Irregular verb type (러-final) */
+ public static final int VERB_TYPE_RU = 'R';
+
+ /** Regular verb type */
+ public static final int VERB_TYPE_REGULAR = 'X';
/**
* ë¨ì´
@@ -45,21 +66,17 @@ public class WordEntry {
private final List<CompoundEntry> compounds;
- public WordEntry(String word) {
- this(word, null);
- }
-
- public WordEntry(String word, char[] cs) {
- this(word, cs, Collections.<CompoundEntry>emptyList());
- }
-
public WordEntry(String word, char[] cs, List<CompoundEntry> compounds) {
if (cs.length != 10) {
throw new IllegalArgumentException("invalid features for word: " + word + ", got:" + new String(cs));
}
this.word = word;
this.features = cs;
- this.compounds = Collections.unmodifiableList(compounds);
+ this.compounds = compounds == null ? null : Collections.unmodifiableList(compounds);
+ // has compound list iff compound feature is set ('2' in main dictionary, '9' in uncompounds)
+ // TODO: implement validCompound check differently: uncompounds shouldn't use WordEntry
+ assert (features[IDX_NOUN] >= '2' && compounds != null && compounds.size() > 1)
+ || (features[IDX_NOUN] <= '2' && compounds == null) : "inconsistent compound data for word: " + word;
}
public String getWord() {
@@ -67,15 +84,37 @@ public class WordEntry {
}
public char getFeature(int index) {
- if(features==null||features.length<index) return '0';
return features[index];
}
- public char[] getFeatures() {
- return this.features;
+ /** Returns true if the entry is a noun (or compound noun) */
+ public boolean isNoun() {
+ return features[IDX_NOUN] != '0';
}
+ /** Returns true if entry is a compound noun */
+ public boolean isCompoundNoun() {
+ return features[IDX_NOUN] >= '2';
+ }
+
+ /** Returns List of compounds for word */
public List<CompoundEntry> getCompounds() {
- return this.compounds;
+ assert isCompoundNoun();
+ return compounds;
+ }
+
+ /** Returns true if entry is verb */
+ public boolean isVerb() {
+ return features[IDX_VERB] == '1';
+ }
+
+ /** Returns verb type (VERB_TYPE_REGULAR or one of the irregular VERB_TYPE_* constants) */
+ public int getVerbType() {
+ return features[IDX_REGURA];
+ }
+
+ /** Returns true if entry is busa (adverb) */
+ public boolean isAdverb() {
+ return features[IDX_BUSA] == '1';
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/morph/WordSpaceAnalyzer.java Mon Oct 21 04:22:53 2013
@@ -345,12 +345,12 @@ public class WordSpaceAnalyzer {
List<CompoundEntry> cnouns = o.getCNounList();
if(cnouns.size()==0) {
boolean is = DictionaryUtil.getWordExceptVerb(pvword)!=null;
- cnouns.add(new CompoundEntry(pvword,0,is));
+ cnouns.add(new CompoundEntry(pvword, is));
}
for(AnalysisOutput candidate : candidates) {
candidate.getCNounList().addAll(cnouns);
- candidate.getCNounList().add(new CompoundEntry(candidate.getStem(),0,true));
+ candidate.getCNounList().add(new CompoundEntry(candidate.getStem(), true));
candidate.setStem(pvword+candidate.getStem()); // ì´ë ê² í´ì¼ WSOutput ì ë³µí©ëª
ì¬ ì²ë¦¬í ë ì ìì²ë¦¬ë¨
}
@@ -444,10 +444,7 @@ public class WordSpaceAnalyzer {
AnalysisOutput o = candidates.remove(0);
AnalysisOutput po = output.getPhrases().size()>0 ? output.getPhrases().get(output.getPhrases().size()-1) : null;
- String ejend = o.getSource().substring(o.getStem().length());
-
- char[] chrs = po!=null&&po.getStem().length()>0 ? MorphUtil.decompose(po.getStem().charAt(po.getStem().length()-1)) : null;
- String pjend = po!=null&&po.getStem().length()>0 ? po.getSource().substring(po.getStem().length()) : null;
+ String ejend = o.getSource().substring(o.getStem().length());
char ja = 'x'; // ììì 문ì
if(po!=null&&(po.getPatn()==PatternConstants.PTN_VM||po.getPatn()==PatternConstants.PTN_VMCM||po.getPatn()==PatternConstants.PTN_VMXM)) {
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/compounds.dic Mon Oct 21 04:22:53 2013
@@ -70,10 +70,6 @@
ê²°ì문:ê²°ì,문:0000
ê²°ìì:ê²°ì,ì:0000
ê²°í¼ì:ê²°í¼,ì:1000
-ê²½ìì¸:ê²½ìì¸:0000
-ê²½ìì:ê²½ìì:0000
-ê²½ì주:ê²½ì주:0000
-ê²½ìì§:ê²½ìì§:0000
ê³ êµì°ë§¹ì :ê³ êµ,ì°ë§¹ì :0000
골íì¥:골í,ì¥:0000
ê³°ë¹ëê³ ì¬ë¦¬:ê³°ë¹ë,ê³ ì¬ë¦¬:0000
@@ -99,26 +95,21 @@
ê³ í¬ìì¨:ê³ ,í¬ì,ì¨:0000
ê²½ì ì² :ê²½,ì ì² :0000
ê°ê¹ì¹:ê°,ê¹ì¹:0000
-ê°ì ê³ ë¦¬:ê°ì ê³ ë¦¬:0000
ë°¤íë:ë°¤,íë:0000
ê°ê°ì¸:ê°,ê°ì¸:0000
ê°ê³ 기:ê°,ê³ ê¸°:0000
ê°ê³ ì¬ë¦¬:ê°,ê³ ì¬ë¦¬:0000
-ê°ê´ì :ê°ê´ì :0000
ê°êµ¬ë¦¬ë:ê°êµ¬ë¦¬,ë:0000
ê°êµ¬ë¦¬ë°¥:ê°êµ¬ë¦¬,ë°¥:0000
ê°ê·¸ë§¨:ê°ê·¸,맨:0000
ê°ê¸°ìì:ê°ê¸°,ìì:0000
ê°ê¸°ì¼ì:ê°ê¸°,ì¼ì:0000
ê°ë
í:ê°ë
,í:1100
-ê°ë¡ìì ë¹:ê°ë¡ìì ë¹:0000
ê°ë¥ë²ë :ê°ë¥,ë²ë :0000
ê°ë¥ì§ë¹ ê·:ê°ë¥,ì§ë¹ ê·:0000
ê°ë§ì:ê°ë§,ì:0000
-ê°ë§ì :ê°ë§ì :1000
ê°ë¨¸ë£¨ë©êµ´:ê°ë¨¸ë£¨,ë©êµ´:0000
ê°ë¯¸êµ´:ê°ë¯¸,êµ´:0000
-ê°ë°ëìêµ:ê°ë°ëìêµ:0000
ê°ë°ì:ê°ë°,ì:0000
ê°ë°¥ëë:ê°,ë°¥,ëë:0000
ê°ë³í:ê°ë³,í:0000
@@ -128,7 +119,6 @@
ê°ì 문:ê°ì ,문:0000
ê°ì ì:ê°ì ,ì:0000
ê°ì±ì :ê°ì±,ì :0000
-ê°ì°ì±:ê°ì°ì±:0000
ê°ì¸ê°:ê°ì¸,ê°:0000
ê°ì¸ë¬¼:ê°ì¸,물:0000
ê°ì ë ¹:ê°ì ,ë ¹:0000
@@ -143,7 +133,6 @@
ê°ê´ì±:ê°ê´,ì±:0000
ê°ê´ì :ê°ê´,ì :0000
ê°ê´í:ê°ê´,í:1100
-ê°ì§ì´ì´:ê°ì§ì´ì´:1000
ê°¯ë§ì:ê°¯,ë§ì:0000
ê°¯ë°ì:ê°¯,ë°ì:0000
ê°¯ë²ë¤:ê°¯,ë²ë¤:0000
@@ -158,7 +147,6 @@
거족ì :거족,ì :0000
거주ì§:거주,ì§:0000
ê±°ì§ë°:ê±°ì§,ë°:0000
-ê±°ì§ë§:ê±°ì§ë§:1000
ê±°ì§ë§ìì´:ê±°ì§,ë§,ìì´:0000
ê±°ì§ë¶ë ê±°ì§ë¶ë :ê±°ì§,ë¶ë :0000
ê±±ì 거리:ê±±ì ,거리:0000
@@ -188,7 +176,6 @@
ê²°ë²½ì¦:ê²°ë²½,ì¦:0000
ê²°ì¬ì :ê²°ì¬,ì :0000
ê²¹ê°íµ:ê²¹,ê°íµ:1000
-ê²¹ì¬ë:ê²¹ì¬ë:0000
ê²½ê°ì¬:ê²½ê°,ì¬:0000
ê²½ê³ì :ê²½ê³,ì :0000
ê²½ê³ì¬:ê²½ê³,ì¬:0000
@@ -562,7 +549,6 @@
ê¸ë©ë¬:ê¸,ë©ë¬:0000
ê¸ë°ì§:ê¸,ë°ì§:0000
ê¸ë¹ëë¹ë:ê¸ë¹,ë,ë¹ë:0000
-ê¸ìì±:ê¸ìì±:0000
ê¸ì주ìì:ê¸ì,주ìì:0000
ê¸ìµê³:ê¸ìµ,ê³:0000
ê¸ìµì
:ê¸ìµ,ì
:0000
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic?rev=1534021&r1=1534020&r2=1534021&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/extension.dic Mon Oct 21 04:22:53 2013
@@ -5338,4 +5338,19 @@
íì¤ì¹,100110000X
íí¬í¬ë¼í
ì¤,100110000X
íí¼ì¤í¸ë¼,100110000X
-íì§,100110000X
\ No newline at end of file
+íì§,100110000X
+! compound exceptions
+ê²½ìì¸,100000000X
+ê²½ìì,100000000X
+ê²½ì주,100000000X
+ê²½ìì§,100000000X
+ê°ì ê³ ë¦¬,100000000X
+ê°ê´ì ,100000000X
+ê°ë¡ìì ë¹,100000000X
+ê°ë§ì ,100100000X
+ê°ë°ëìêµ,100000000X
+ê°ì°ì±,100000000X
+ê°ì§ì´ì´,100100000X
+ê±°ì§ë§,100100000X
+ê²¹ì¬ë,100000000X
+ê¸ìì±,100000000X